blob: a66aaa2010a3a10da0c200209c115c167c7c22bb [file] [log] [blame]
//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the NumericLiteralParser, CharLiteralParser, and
// StringLiteralParser interfaces.
//
//===----------------------------------------------------------------------===//
14
15#include "clang/Lex/LiteralSupport.h"
16#include "clang/Lex/Preprocessor.h"
Reid Spencer5f016e22007-07-11 17:01:13 +000017#include "clang/Basic/Diagnostic.h"
Chris Lattner136f93a2007-07-16 06:55:01 +000018#include "clang/Basic/TargetInfo.h"
Reid Spencer5f016e22007-07-11 17:01:13 +000019#include "llvm/ADT/StringExtras.h"
20using namespace clang;
21
/// HexDigitValue - Translate a single hexadecimal digit character into its
/// numeric value (0-15).  Both upper- and lower-case letters are accepted.
/// Any character that is not a hex digit yields -1.
static int HexDigitValue(char C) {
  switch (C) {
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    return C - '0';
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    return 10 + (C - 'a');
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    return 10 + (C - 'A');
  default:
    return -1;
  }
}
30
31/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
32/// either a character or a string literal.
33static unsigned ProcessCharEscape(const char *&ThisTokBuf,
34 const char *ThisTokEnd, bool &HadError,
35 SourceLocation Loc, bool IsWide,
36 Preprocessor &PP) {
37 // Skip the '\' char.
38 ++ThisTokBuf;
39
40 // We know that this character can't be off the end of the buffer, because
41 // that would have been \", which would not have been the end of string.
42 unsigned ResultChar = *ThisTokBuf++;
43 switch (ResultChar) {
44 // These map to themselves.
45 case '\\': case '\'': case '"': case '?': break;
46
47 // These have fixed mappings.
48 case 'a':
49 // TODO: K&R: the meaning of '\\a' is different in traditional C
50 ResultChar = 7;
51 break;
52 case 'b':
53 ResultChar = 8;
54 break;
55 case 'e':
56 PP.Diag(Loc, diag::ext_nonstandard_escape, "e");
57 ResultChar = 27;
58 break;
59 case 'f':
60 ResultChar = 12;
61 break;
62 case 'n':
63 ResultChar = 10;
64 break;
65 case 'r':
66 ResultChar = 13;
67 break;
68 case 't':
69 ResultChar = 9;
70 break;
71 case 'v':
72 ResultChar = 11;
73 break;
74
75 //case 'u': case 'U': // FIXME: UCNs.
76 case 'x': { // Hex escape.
77 ResultChar = 0;
78 if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
79 PP.Diag(Loc, diag::err_hex_escape_no_digits);
80 HadError = 1;
81 break;
82 }
83
84 // Hex escapes are a maximal series of hex digits.
85 bool Overflow = false;
86 for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
87 int CharVal = HexDigitValue(ThisTokBuf[0]);
88 if (CharVal == -1) break;
Chris Lattnerb8128142007-09-03 18:28:41 +000089 Overflow |= (ResultChar & 0xF0000000) ? true : false; // About to shift out a digit?
Reid Spencer5f016e22007-07-11 17:01:13 +000090 ResultChar <<= 4;
91 ResultChar |= CharVal;
92 }
93
94 // See if any bits will be truncated when evaluated as a character.
Chris Lattner98be4942008-03-05 18:54:05 +000095 unsigned CharWidth = PP.getTargetInfo().getCharWidth(IsWide);
Ted Kremenek9c728dc2007-12-12 22:39:36 +000096
Reid Spencer5f016e22007-07-11 17:01:13 +000097 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
98 Overflow = true;
99 ResultChar &= ~0U >> (32-CharWidth);
100 }
101
102 // Check for overflow.
103 if (Overflow) // Too many digits to fit in
104 PP.Diag(Loc, diag::warn_hex_escape_too_large);
105 break;
106 }
107 case '0': case '1': case '2': case '3':
108 case '4': case '5': case '6': case '7': {
109 // Octal escapes.
110 --ThisTokBuf;
111 ResultChar = 0;
112
113 // Octal escapes are a series of octal digits with maximum length 3.
114 // "\0123" is a two digit sequence equal to "\012" "3".
115 unsigned NumDigits = 0;
116 do {
117 ResultChar <<= 3;
118 ResultChar |= *ThisTokBuf++ - '0';
119 ++NumDigits;
120 } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
121 ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
122
123 // Check for overflow. Reject '\777', but not L'\777'.
Chris Lattner98be4942008-03-05 18:54:05 +0000124 unsigned CharWidth = PP.getTargetInfo().getCharWidth(IsWide);
Ted Kremenek9c728dc2007-12-12 22:39:36 +0000125
Reid Spencer5f016e22007-07-11 17:01:13 +0000126 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
127 PP.Diag(Loc, diag::warn_octal_escape_too_large);
128 ResultChar &= ~0U >> (32-CharWidth);
129 }
130 break;
131 }
132
133 // Otherwise, these are not valid escapes.
134 case '(': case '{': case '[': case '%':
135 // GCC accepts these as extensions. We warn about them as such though.
136 if (!PP.getLangOptions().NoExtensions) {
137 PP.Diag(Loc, diag::ext_nonstandard_escape,
138 std::string()+(char)ResultChar);
139 break;
140 }
141 // FALL THROUGH.
142 default:
143 if (isgraph(ThisTokBuf[0])) {
144 PP.Diag(Loc, diag::ext_unknown_escape, std::string()+(char)ResultChar);
145 } else {
146 PP.Diag(Loc, diag::ext_unknown_escape, "x"+llvm::utohexstr(ResultChar));
147 }
148 break;
149 }
150
151 return ResultChar;
152}
153
154
155
156
157/// integer-constant: [C99 6.4.4.1]
158/// decimal-constant integer-suffix
159/// octal-constant integer-suffix
160/// hexadecimal-constant integer-suffix
161/// decimal-constant:
162/// nonzero-digit
163/// decimal-constant digit
164/// octal-constant:
165/// 0
166/// octal-constant octal-digit
167/// hexadecimal-constant:
168/// hexadecimal-prefix hexadecimal-digit
169/// hexadecimal-constant hexadecimal-digit
170/// hexadecimal-prefix: one of
171/// 0x 0X
172/// integer-suffix:
173/// unsigned-suffix [long-suffix]
174/// unsigned-suffix [long-long-suffix]
175/// long-suffix [unsigned-suffix]
176/// long-long-suffix [unsigned-sufix]
177/// nonzero-digit:
178/// 1 2 3 4 5 6 7 8 9
179/// octal-digit:
180/// 0 1 2 3 4 5 6 7
181/// hexadecimal-digit:
182/// 0 1 2 3 4 5 6 7 8 9
183/// a b c d e f
184/// A B C D E F
185/// unsigned-suffix: one of
186/// u U
187/// long-suffix: one of
188/// l L
189/// long-long-suffix: one of
190/// ll LL
191///
192/// floating-constant: [C99 6.4.4.2]
193/// TODO: add rules...
194///
Reid Spencer5f016e22007-07-11 17:01:13 +0000195NumericLiteralParser::
196NumericLiteralParser(const char *begin, const char *end,
197 SourceLocation TokLoc, Preprocessor &pp)
198 : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
199 s = DigitsBegin = begin;
200 saw_exponent = false;
201 saw_period = false;
Reid Spencer5f016e22007-07-11 17:01:13 +0000202 isLong = false;
203 isUnsigned = false;
204 isLongLong = false;
Chris Lattner6e400c22007-08-26 03:29:23 +0000205 isFloat = false;
Chris Lattner506b8de2007-08-26 01:58:14 +0000206 isImaginary = false;
Reid Spencer5f016e22007-07-11 17:01:13 +0000207 hadError = false;
208
209 if (*s == '0') { // parse radix
Chris Lattner368328c2008-06-30 06:39:54 +0000210 ParseNumberStartingWithZero(TokLoc);
211 if (hadError)
212 return;
Reid Spencer5f016e22007-07-11 17:01:13 +0000213 } else { // the first digit is non-zero
214 radix = 10;
215 s = SkipDigits(s);
216 if (s == ThisTokEnd) {
217 // Done.
Christopher Lamb016765e2007-11-29 06:06:27 +0000218 } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
Chris Lattner0b7f69d2008-04-20 18:41:46 +0000219 Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
220 diag::err_invalid_decimal_digit, std::string(s, s+1));
Reid Spencer5f016e22007-07-11 17:01:13 +0000221 return;
222 } else if (*s == '.') {
223 s++;
224 saw_period = true;
225 s = SkipDigits(s);
226 }
Chris Lattner4411f462008-09-29 23:12:31 +0000227 if ((*s == 'e' || *s == 'E')) { // exponent
Chris Lattner70f66ab2008-04-20 18:47:55 +0000228 const char *Exponent = s;
Reid Spencer5f016e22007-07-11 17:01:13 +0000229 s++;
230 saw_exponent = true;
231 if (*s == '+' || *s == '-') s++; // sign
232 const char *first_non_digit = SkipDigits(s);
Chris Lattner0b7f69d2008-04-20 18:41:46 +0000233 if (first_non_digit != s) {
Reid Spencer5f016e22007-07-11 17:01:13 +0000234 s = first_non_digit;
Chris Lattner0b7f69d2008-04-20 18:41:46 +0000235 } else {
Chris Lattner70f66ab2008-04-20 18:47:55 +0000236 Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
Chris Lattner0b7f69d2008-04-20 18:41:46 +0000237 diag::err_exponent_has_no_digits);
238 return;
Reid Spencer5f016e22007-07-11 17:01:13 +0000239 }
240 }
241 }
242
243 SuffixBegin = s;
Chris Lattner506b8de2007-08-26 01:58:14 +0000244
245 // Parse the suffix. At this point we can classify whether we have an FP or
246 // integer constant.
247 bool isFPConstant = isFloatingLiteral();
248
249 // Loop over all of the characters of the suffix. If we see something bad,
250 // we break out of the loop.
251 for (; s != ThisTokEnd; ++s) {
252 switch (*s) {
253 case 'f': // FP Suffix for "float"
254 case 'F':
255 if (!isFPConstant) break; // Error for integer constant.
Chris Lattner6e400c22007-08-26 03:29:23 +0000256 if (isFloat || isLong) break; // FF, LF invalid.
257 isFloat = true;
Chris Lattner506b8de2007-08-26 01:58:14 +0000258 continue; // Success.
259 case 'u':
260 case 'U':
261 if (isFPConstant) break; // Error for floating constant.
262 if (isUnsigned) break; // Cannot be repeated.
263 isUnsigned = true;
264 continue; // Success.
265 case 'l':
266 case 'L':
267 if (isLong || isLongLong) break; // Cannot be repeated.
Chris Lattner6e400c22007-08-26 03:29:23 +0000268 if (isFloat) break; // LF invalid.
Chris Lattner506b8de2007-08-26 01:58:14 +0000269
270 // Check for long long. The L's need to be adjacent and the same case.
271 if (s+1 != ThisTokEnd && s[1] == s[0]) {
272 if (isFPConstant) break; // long long invalid for floats.
273 isLongLong = true;
274 ++s; // Eat both of them.
275 } else {
Reid Spencer5f016e22007-07-11 17:01:13 +0000276 isLong = true;
Reid Spencer5f016e22007-07-11 17:01:13 +0000277 }
Chris Lattner506b8de2007-08-26 01:58:14 +0000278 continue; // Success.
279 case 'i':
Steve Naroff0c29b222008-04-04 21:02:54 +0000280 if (PP.getLangOptions().Microsoft) {
281 // Allow i8, i16, i32, i64, and i128.
282 if (++s == ThisTokEnd) break;
283 switch (*s) {
284 case '8':
285 s++; // i8 suffix
286 break;
287 case '1':
288 if (++s == ThisTokEnd) break;
289 if (*s == '6') s++; // i16 suffix
290 else if (*s == '2') {
291 if (++s == ThisTokEnd) break;
292 if (*s == '8') s++; // i128 suffix
293 }
294 break;
295 case '3':
296 if (++s == ThisTokEnd) break;
297 if (*s == '2') s++; // i32 suffix
298 break;
299 case '6':
300 if (++s == ThisTokEnd) break;
301 if (*s == '4') s++; // i64 suffix
302 break;
303 default:
304 break;
305 }
306 break;
307 }
308 // fall through.
Chris Lattner506b8de2007-08-26 01:58:14 +0000309 case 'I':
310 case 'j':
311 case 'J':
312 if (isImaginary) break; // Cannot be repeated.
313 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
314 diag::ext_imaginary_constant);
315 isImaginary = true;
316 continue; // Success.
Reid Spencer5f016e22007-07-11 17:01:13 +0000317 }
Chris Lattner506b8de2007-08-26 01:58:14 +0000318 // If we reached here, there was an error.
319 break;
320 }
321
322 // Report an error if there are any.
323 if (s != ThisTokEnd) {
Chris Lattner0b7f69d2008-04-20 18:41:46 +0000324 Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
325 isFPConstant ? diag::err_invalid_suffix_float_constant :
326 diag::err_invalid_suffix_integer_constant,
Chris Lattner506b8de2007-08-26 01:58:14 +0000327 std::string(SuffixBegin, ThisTokEnd));
328 return;
Reid Spencer5f016e22007-07-11 17:01:13 +0000329 }
330}
331
Chris Lattner368328c2008-06-30 06:39:54 +0000332/// ParseNumberStartingWithZero - This method is called when the first character
333/// of the number is found to be a zero. This means it is either an octal
334/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
335/// a floating point number (01239.123e4). Eat the prefix, determining the
336/// radix etc.
337void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
338 assert(s[0] == '0' && "Invalid method call");
339 s++;
340
341 // Handle a hex number like 0x1234.
342 if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
343 s++;
344 radix = 16;
345 DigitsBegin = s;
346 s = SkipHexDigits(s);
347 if (s == ThisTokEnd) {
348 // Done.
349 } else if (*s == '.') {
350 s++;
351 saw_period = true;
352 s = SkipHexDigits(s);
353 }
354 // A binary exponent can appear with or with a '.'. If dotted, the
355 // binary exponent is required.
Chris Lattner6ea62382008-07-25 18:18:34 +0000356 if (*s == 'p' || *s == 'P') {
Chris Lattner368328c2008-06-30 06:39:54 +0000357 const char *Exponent = s;
358 s++;
359 saw_exponent = true;
360 if (*s == '+' || *s == '-') s++; // sign
361 const char *first_non_digit = SkipDigits(s);
Chris Lattner6ea62382008-07-25 18:18:34 +0000362 if (first_non_digit == s) {
Chris Lattner368328c2008-06-30 06:39:54 +0000363 Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
364 diag::err_exponent_has_no_digits);
Chris Lattner6ea62382008-07-25 18:18:34 +0000365 return;
Chris Lattner368328c2008-06-30 06:39:54 +0000366 }
Chris Lattner6ea62382008-07-25 18:18:34 +0000367 s = first_non_digit;
368
369 if (!PP.getLangOptions().HexFloats)
370 Diag(TokLoc, diag::ext_hexconstant_invalid);
Chris Lattner368328c2008-06-30 06:39:54 +0000371 } else if (saw_period) {
372 Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
373 diag::err_hexconstant_requires_exponent);
374 }
375 return;
376 }
377
378 // Handle simple binary numbers 0b01010
379 if (*s == 'b' || *s == 'B') {
380 // 0b101010 is a GCC extension.
Chris Lattner413d3552008-06-30 06:44:49 +0000381 PP.Diag(TokLoc, diag::ext_binary_literal);
Chris Lattner368328c2008-06-30 06:39:54 +0000382 ++s;
383 radix = 2;
384 DigitsBegin = s;
385 s = SkipBinaryDigits(s);
386 if (s == ThisTokEnd) {
387 // Done.
388 } else if (isxdigit(*s)) {
389 Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
390 diag::err_invalid_binary_digit, std::string(s, s+1));
Chris Lattner368328c2008-06-30 06:39:54 +0000391 }
Chris Lattner413d3552008-06-30 06:44:49 +0000392 // Other suffixes will be diagnosed by the caller.
Chris Lattner368328c2008-06-30 06:39:54 +0000393 return;
394 }
395
396 // For now, the radix is set to 8. If we discover that we have a
397 // floating point constant, the radix will change to 10. Octal floating
398 // point constants are not permitted (only decimal and hexadecimal).
399 radix = 8;
400 DigitsBegin = s;
401 s = SkipOctalDigits(s);
402 if (s == ThisTokEnd)
403 return; // Done, simple octal number like 01234
404
Chris Lattner413d3552008-06-30 06:44:49 +0000405 // If we have some other non-octal digit that *is* a decimal digit, see if
406 // this is part of a floating point number like 094.123 or 09e1.
407 if (isdigit(*s)) {
408 const char *EndDecimal = SkipDigits(s);
409 if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
410 s = EndDecimal;
411 radix = 10;
412 }
413 }
414
415 // If we have a hex digit other than 'e' (which denotes a FP exponent) then
416 // the code is using an incorrect base.
Chris Lattner368328c2008-06-30 06:39:54 +0000417 if (isxdigit(*s) && *s != 'e' && *s != 'E') {
418 Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
419 diag::err_invalid_octal_digit, std::string(s, s+1));
420 return;
421 }
422
423 if (*s == '.') {
424 s++;
425 radix = 10;
426 saw_period = true;
Chris Lattner413d3552008-06-30 06:44:49 +0000427 s = SkipDigits(s); // Skip suffix.
Chris Lattner368328c2008-06-30 06:39:54 +0000428 }
429 if (*s == 'e' || *s == 'E') { // exponent
430 const char *Exponent = s;
431 s++;
432 radix = 10;
433 saw_exponent = true;
434 if (*s == '+' || *s == '-') s++; // sign
435 const char *first_non_digit = SkipDigits(s);
436 if (first_non_digit != s) {
437 s = first_non_digit;
438 } else {
439 Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
440 diag::err_exponent_has_no_digits);
441 return;
442 }
443 }
444}
445
446
Reid Spencer5f016e22007-07-11 17:01:13 +0000447/// GetIntegerValue - Convert this numeric literal value to an APInt that
448/// matches Val's input width. If there is an overflow, set Val to the low bits
449/// of the result and return true. Otherwise, return false.
450bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
451 Val = 0;
452 s = DigitsBegin;
453
454 llvm::APInt RadixVal(Val.getBitWidth(), radix);
455 llvm::APInt CharVal(Val.getBitWidth(), 0);
456 llvm::APInt OldVal = Val;
457
458 bool OverflowOccurred = false;
459 while (s < SuffixBegin) {
460 unsigned C = HexDigitValue(*s++);
461
462 // If this letter is out of bound for this radix, reject it.
463 assert(C < radix && "NumericLiteralParser ctor should have rejected this");
464
465 CharVal = C;
466
467 // Add the digit to the value in the appropriate radix. If adding in digits
468 // made the value smaller, then this overflowed.
469 OldVal = Val;
470
471 // Multiply by radix, did overflow occur on the multiply?
472 Val *= RadixVal;
473 OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
474
475 OldVal = Val;
476 // Add value, did overflow occur on the value?
477 Val += CharVal;
478 OverflowOccurred |= Val.ult(OldVal);
479 OverflowOccurred |= Val.ult(CharVal);
480 }
481 return OverflowOccurred;
482}
483
Chris Lattner525a0502007-09-22 18:29:59 +0000484llvm::APFloat NumericLiteralParser::
Ted Kremenek427d5af2007-11-26 23:12:30 +0000485GetFloatValue(const llvm::fltSemantics &Format, bool* isExact) {
486 using llvm::APFloat;
487
Ted Kremenek32e61bf2007-11-29 00:54:29 +0000488 llvm::SmallVector<char,256> floatChars;
489 for (unsigned i = 0, n = ThisTokEnd-ThisTokBegin; i != n; ++i)
490 floatChars.push_back(ThisTokBegin[i]);
491
492 floatChars.push_back('\0');
493
Ted Kremenek427d5af2007-11-26 23:12:30 +0000494 APFloat V (Format, APFloat::fcZero, false);
Ted Kremenek427d5af2007-11-26 23:12:30 +0000495 APFloat::opStatus status;
Ted Kremenek32e61bf2007-11-29 00:54:29 +0000496
497 status = V.convertFromString(&floatChars[0],APFloat::rmNearestTiesToEven);
Ted Kremenek427d5af2007-11-26 23:12:30 +0000498
499 if (isExact)
500 *isExact = status == APFloat::opOK;
501
502 return V;
Reid Spencer5f016e22007-07-11 17:01:13 +0000503}
504
505void NumericLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
506 const std::string &M) {
507 PP.Diag(Loc, DiagID, M);
508 hadError = true;
509}
510
511
512CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
513 SourceLocation Loc, Preprocessor &PP) {
514 // At this point we know that the character matches the regex "L?'.*'".
515 HadError = false;
516 Value = 0;
517
518 // Determine if this is a wide character.
519 IsWide = begin[0] == 'L';
520 if (IsWide) ++begin;
521
522 // Skip over the entry quote.
523 assert(begin[0] == '\'' && "Invalid token lexed");
524 ++begin;
525
526 // FIXME: This assumes that 'int' is 32-bits in overflow calculation, and the
527 // size of "value".
Chris Lattner98be4942008-03-05 18:54:05 +0000528 assert(PP.getTargetInfo().getIntWidth() == 32 &&
Reid Spencer5f016e22007-07-11 17:01:13 +0000529 "Assumes sizeof(int) == 4 for now");
530 // FIXME: This assumes that wchar_t is 32-bits for now.
Chris Lattner98be4942008-03-05 18:54:05 +0000531 assert(PP.getTargetInfo().getWCharWidth() == 32 &&
Reid Spencer5f016e22007-07-11 17:01:13 +0000532 "Assumes sizeof(wchar_t) == 4 for now");
533 // FIXME: This extensively assumes that 'char' is 8-bits.
Chris Lattner98be4942008-03-05 18:54:05 +0000534 assert(PP.getTargetInfo().getCharWidth() == 8 &&
Reid Spencer5f016e22007-07-11 17:01:13 +0000535 "Assumes char is 8 bits");
536
537 bool isFirstChar = true;
538 bool isMultiChar = false;
539 while (begin[0] != '\'') {
540 unsigned ResultChar;
541 if (begin[0] != '\\') // If this is a normal character, consume it.
542 ResultChar = *begin++;
543 else // Otherwise, this is an escape character.
544 ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP);
545
546 // If this is a multi-character constant (e.g. 'abc'), handle it. These are
547 // implementation defined (C99 6.4.4.4p10).
548 if (!isFirstChar) {
549 // If this is the second character being processed, do special handling.
550 if (!isMultiChar) {
551 isMultiChar = true;
552
553 // Warn about discarding the top bits for multi-char wide-character
554 // constants (L'abcd').
555 if (IsWide)
556 PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
557 }
558
559 if (IsWide) {
560 // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
561 Value = 0;
562 } else {
563 // Narrow character literals act as though their value is concatenated
564 // in this implementation.
565 if (((Value << 8) >> 8) != Value)
566 PP.Diag(Loc, diag::warn_char_constant_too_large);
567 Value <<= 8;
568 }
569 }
570
571 Value += ResultChar;
572 isFirstChar = false;
573 }
574
575 // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
576 // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
577 // character constants are not sign extended in the this implementation:
578 // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
579 if (!IsWide && !isMultiChar && (Value & 128) &&
Chris Lattner98be4942008-03-05 18:54:05 +0000580 PP.getTargetInfo().isCharSigned())
Reid Spencer5f016e22007-07-11 17:01:13 +0000581 Value = (signed char)Value;
582}
583
584
585/// string-literal: [C99 6.4.5]
586/// " [s-char-sequence] "
587/// L" [s-char-sequence] "
588/// s-char-sequence:
589/// s-char
590/// s-char-sequence s-char
591/// s-char:
592/// any source character except the double quote ",
593/// backslash \, or newline character
594/// escape-character
595/// universal-character-name
596/// escape-character: [C99 6.4.4.4]
597/// \ escape-code
598/// universal-character-name
599/// escape-code:
600/// character-escape-code
601/// octal-escape-code
602/// hex-escape-code
603/// character-escape-code: one of
604/// n t b r f v a
605/// \ ' " ?
606/// octal-escape-code:
607/// octal-digit
608/// octal-digit octal-digit
609/// octal-digit octal-digit octal-digit
610/// hex-escape-code:
611/// x hex-digit
612/// hex-escape-code hex-digit
613/// universal-character-name:
614/// \u hex-quad
615/// \U hex-quad hex-quad
616/// hex-quad:
617/// hex-digit hex-digit hex-digit hex-digit
618///
619StringLiteralParser::
Chris Lattnerd2177732007-07-20 16:59:19 +0000620StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
Reid Spencer5f016e22007-07-11 17:01:13 +0000621 Preprocessor &pp, TargetInfo &t)
622 : PP(pp), Target(t) {
623 // Scan all of the string portions, remember the max individual token length,
624 // computing a bound on the concatenated string length, and see whether any
625 // piece is a wide-string. If any of the string portions is a wide-string
626 // literal, the result is a wide-string literal [C99 6.4.5p4].
627 MaxTokenLength = StringToks[0].getLength();
628 SizeBound = StringToks[0].getLength()-2; // -2 for "".
Chris Lattner22f6bbc2007-10-09 18:02:16 +0000629 AnyWide = StringToks[0].is(tok::wide_string_literal);
Reid Spencer5f016e22007-07-11 17:01:13 +0000630
631 hadError = false;
632
633 // Implement Translation Phase #6: concatenation of string literals
634 /// (C99 5.1.1.2p1). The common case is only one string fragment.
635 for (unsigned i = 1; i != NumStringToks; ++i) {
636 // The string could be shorter than this if it needs cleaning, but this is a
637 // reasonable bound, which is all we need.
638 SizeBound += StringToks[i].getLength()-2; // -2 for "".
639
640 // Remember maximum string piece length.
641 if (StringToks[i].getLength() > MaxTokenLength)
642 MaxTokenLength = StringToks[i].getLength();
643
644 // Remember if we see any wide strings.
Chris Lattner22f6bbc2007-10-09 18:02:16 +0000645 AnyWide |= StringToks[i].is(tok::wide_string_literal);
Reid Spencer5f016e22007-07-11 17:01:13 +0000646 }
647
648
649 // Include space for the null terminator.
650 ++SizeBound;
651
652 // TODO: K&R warning: "traditional C rejects string constant concatenation"
653
654 // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not
655 // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true.
656 wchar_tByteWidth = ~0U;
657 if (AnyWide) {
Chris Lattner98be4942008-03-05 18:54:05 +0000658 wchar_tByteWidth = Target.getWCharWidth();
Reid Spencer5f016e22007-07-11 17:01:13 +0000659 assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
660 wchar_tByteWidth /= 8;
661 }
662
663 // The output buffer size needs to be large enough to hold wide characters.
664 // This is a worst-case assumption which basically corresponds to L"" "long".
665 if (AnyWide)
666 SizeBound *= wchar_tByteWidth;
667
668 // Size the temporary buffer to hold the result string data.
669 ResultBuf.resize(SizeBound);
670
671 // Likewise, but for each string piece.
672 llvm::SmallString<512> TokenBuf;
673 TokenBuf.resize(MaxTokenLength);
674
675 // Loop over all the strings, getting their spelling, and expanding them to
676 // wide strings as appropriate.
677 ResultPtr = &ResultBuf[0]; // Next byte to fill in.
678
Anders Carlssonee98ac52007-10-15 02:50:23 +0000679 Pascal = false;
680
Reid Spencer5f016e22007-07-11 17:01:13 +0000681 for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
682 const char *ThisTokBuf = &TokenBuf[0];
683 // Get the spelling of the token, which eliminates trigraphs, etc. We know
684 // that ThisTokBuf points to a buffer that is big enough for the whole token
685 // and 'spelled' tokens can only shrink.
686 unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
687 const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote.
688
689 // TODO: Input character set mapping support.
690
691 // Skip L marker for wide strings.
692 bool ThisIsWide = false;
693 if (ThisTokBuf[0] == 'L') {
694 ++ThisTokBuf;
695 ThisIsWide = true;
696 }
697
698 assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
699 ++ThisTokBuf;
700
Anders Carlssonee98ac52007-10-15 02:50:23 +0000701 // Check if this is a pascal string
702 if (pp.getLangOptions().PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
703 ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
704
705 // If the \p sequence is found in the first token, we have a pascal string
706 // Otherwise, if we already have a pascal string, ignore the first \p
707 if (i == 0) {
708 ++ThisTokBuf;
709 Pascal = true;
710 } else if (Pascal)
711 ThisTokBuf += 2;
712 }
713
Reid Spencer5f016e22007-07-11 17:01:13 +0000714 while (ThisTokBuf != ThisTokEnd) {
715 // Is this a span of non-escape characters?
716 if (ThisTokBuf[0] != '\\') {
717 const char *InStart = ThisTokBuf;
718 do {
719 ++ThisTokBuf;
720 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
721
722 // Copy the character span over.
723 unsigned Len = ThisTokBuf-InStart;
724 if (!AnyWide) {
725 memcpy(ResultPtr, InStart, Len);
726 ResultPtr += Len;
727 } else {
728 // Note: our internal rep of wide char tokens is always little-endian.
729 for (; Len; --Len, ++InStart) {
730 *ResultPtr++ = InStart[0];
731 // Add zeros at the end.
732 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
733 *ResultPtr++ = 0;
734 }
735 }
736 continue;
737 }
738
739 // Otherwise, this is an escape character. Process it.
740 unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
741 StringToks[i].getLocation(),
742 ThisIsWide, PP);
743
744 // Note: our internal rep of wide char tokens is always little-endian.
745 *ResultPtr++ = ResultChar & 0xFF;
746
747 if (AnyWide) {
748 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
749 *ResultPtr++ = ResultChar >> i*8;
750 }
751 }
752 }
753
754 // Add zero terminator.
755 *ResultPtr = 0;
756 if (AnyWide) {
757 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
758 *ResultPtr++ = 0;
759 }
Anders Carlssonee98ac52007-10-15 02:50:23 +0000760
761 if (Pascal)
762 ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
Reid Spencer5f016e22007-07-11 17:01:13 +0000763}