blob: 944030f8c5a388b873eb933ef5b14ddc2b222499 [file] [log] [blame]
Steve Naroff09ef4742007-03-09 23:16:33 +00001//===--- LiteralSupport.cpp - Code to parse and process literals-*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file was developed by Steve Naroff and is distributed under
6// the University of Illinois Open Source License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements NumericLiteralParser and StringLiteralParser.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/Lex/LiteralSupport.h"
15#include "clang/Lex/Preprocessor.h"
16#include "clang/Basic/TargetInfo.h"
17#include "clang/Basic/Diagnostic.h"
Chris Lattner5b743d32007-04-04 05:52:58 +000018#include "llvm/ADT/APInt.h"
Steve Naroff4f88b312007-03-13 22:37:02 +000019#include "llvm/ADT/StringExtras.h"
Steve Naroff09ef4742007-03-09 23:16:33 +000020using namespace llvm;
21using namespace clang;
22
23/// integer-constant: [C99 6.4.4.1]
24/// decimal-constant integer-suffix
25/// octal-constant integer-suffix
26/// hexadecimal-constant integer-suffix
27/// decimal-constant:
28/// nonzero-digit
29/// decimal-constant digit
30/// octal-constant:
31/// 0
32/// octal-constant octal-digit
33/// hexadecimal-constant:
34/// hexadecimal-prefix hexadecimal-digit
35/// hexadecimal-constant hexadecimal-digit
36/// hexadecimal-prefix: one of
37/// 0x 0X
38/// integer-suffix:
39/// unsigned-suffix [long-suffix]
40/// unsigned-suffix [long-long-suffix]
41/// long-suffix [unsigned-suffix]
42/// long-long-suffix [unsigned-suffix]
43/// nonzero-digit:
44/// 1 2 3 4 5 6 7 8 9
45/// octal-digit:
46/// 0 1 2 3 4 5 6 7
47/// hexadecimal-digit:
48/// 0 1 2 3 4 5 6 7 8 9
49/// a b c d e f
50/// A B C D E F
51/// unsigned-suffix: one of
52/// u U
53/// long-suffix: one of
54/// l L
55/// long-long-suffix: one of
56/// ll LL
57///
58/// floating-constant: [C99 6.4.4.2]
59/// TODO: add rules...
60///
61
62NumericLiteralParser::
63NumericLiteralParser(const char *begin, const char *end,
Steve Naroff451d8f162007-03-12 23:22:38 +000064 SourceLocation TokLoc, Preprocessor &pp) :
65 PP(pp), ThisTokBegin(begin), ThisTokEnd(end)
Steve Naroff09ef4742007-03-09 23:16:33 +000066{
67 s = DigitsBegin = begin;
68 saw_exponent = false;
69 saw_period = false;
70 saw_float_suffix = false;
71 isLong = false;
72 isUnsigned = false;
73 isLongLong = false;
74 hadError = false;
75
76 if (*s == '0') { // parse radix
77 s++;
78 if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
79 s++;
80 radix = 16;
81 DigitsBegin = s;
82 s = SkipHexDigits(s);
83 if (s == ThisTokEnd) {
84 } else if (*s == '.') {
85 s++;
86 saw_period = true;
87 s = SkipHexDigits(s);
88 }
89 // A binary exponent can appear with or with a '.'. If dotted, the
90 // binary exponent is required.
91 if (*s == 'p' || *s == 'P') {
92 s++;
93 saw_exponent = true;
94 if (*s == '+' || *s == '-') s++; // sign
95 const char *first_non_digit = SkipDigits(s);
96 if (first_non_digit == s) {
97 Diag(TokLoc, diag::err_exponent_has_no_digits);
98 return;
99 } else {
100 s = first_non_digit;
101 }
102 } else if (saw_period) {
103 Diag(TokLoc, diag::err_hexconstant_requires_exponent);
104 return;
105 }
106 } else {
107 // For now, the radix is set to 8. If we discover that we have a
108 // floating point constant, the radix will change to 10. Octal floating
109 // point constants are not permitted (only decimal and hexadecimal).
110 radix = 8;
111 DigitsBegin = s;
112 s = SkipOctalDigits(s);
113 if (s == ThisTokEnd) {
114 } else if (*s == '.') {
115 s++;
116 radix = 10;
117 saw_period = true;
118 s = SkipDigits(s);
119 }
120 if (*s == 'e' || *s == 'E') { // exponent
121 s++;
122 radix = 10;
123 saw_exponent = true;
124 if (*s == '+' || *s == '-') s++; // sign
125 const char *first_non_digit = SkipDigits(s);
126 if (first_non_digit == s) {
127 Diag(TokLoc, diag::err_exponent_has_no_digits);
128 return;
129 } else {
130 s = first_non_digit;
131 }
132 }
133 }
134 } else { // the first digit is non-zero
135 radix = 10;
136 s = SkipDigits(s);
137 if (s == ThisTokEnd) {
138 } else if (*s == '.') {
139 s++;
140 saw_period = true;
141 s = SkipDigits(s);
142 }
143 if (*s == 'e' || *s == 'E') { // exponent
144 s++;
145 saw_exponent = true;
146 if (*s == '+' || *s == '-') s++; // sign
147 const char *first_non_digit = SkipDigits(s);
148 if (first_non_digit == s) {
149 Diag(TokLoc, diag::err_exponent_has_no_digits);
150 return;
151 } else {
152 s = first_non_digit;
153 }
154 }
155 }
156
157 SuffixBegin = s;
158
159 if (saw_period || saw_exponent) {
160 if (s < ThisTokEnd) { // parse size suffix (float, long double)
161 if (*s == 'f' || *s == 'F') {
162 saw_float_suffix = true;
163 s++;
164 } else if (*s == 'l' || *s == 'L') {
165 isLong = true;
166 s++;
167 }
168 if (s != ThisTokEnd) {
169 Diag(TokLoc, diag::err_invalid_suffix_float_constant,
170 std::string(SuffixBegin, ThisTokEnd));
171 return;
172 }
173 }
174 } else {
175 if (s < ThisTokEnd) {
176 // parse int suffix - they can appear in any order ("ul", "lu", "llu").
177 if (*s == 'u' || *s == 'U') {
178 s++;
179 isUnsigned = true; // unsigned
180
181 if ((s < ThisTokEnd) && (*s == 'l' || *s == 'L')) {
182 s++;
183 // handle "long long" type - l's need to be adjacent and same case.
184 if ((s < ThisTokEnd) && (*s == *(s-1))) {
185 isLongLong = true; // unsigned long long
186 s++;
187 } else {
188 isLong = true; // unsigned long
189 }
190 }
191 } else if (*s == 'l' || *s == 'L') {
192 s++;
193 // handle "long long" types - l's need to be adjacent and same case.
194 if ((s < ThisTokEnd) && (*s == *(s-1))) {
195 s++;
196 if ((s < ThisTokEnd) && (*s == 'u' || *s == 'U')) {
197 isUnsigned = true; // unsigned long long
198 s++;
199 } else {
200 isLongLong = true; // long long
201 }
202 } else { // handle "long" types
203 if ((s < ThisTokEnd) && (*s == 'u' || *s == 'U')) {
204 isUnsigned = true; // unsigned long
205 s++;
206 } else {
207 isLong = true; // long
208 }
209 }
210 }
211 if (s != ThisTokEnd) {
212 Diag(TokLoc, diag::err_invalid_suffix_integer_constant,
213 std::string(SuffixBegin, ThisTokEnd));
214 return;
215 }
216 }
217 }
218}
219
/// HexLetterToVal - Return the numeric value of the digit character c:
/// '0'-'9' map to 0-9, and 'A'-'F' / 'a'-'f' map to 10-15.  Any other
/// character is a scanning error and trips the assertion.
static unsigned HexLetterToVal(char c) {
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  assert(c >= 'a' && c <= 'f' && "Lexer scanning error");
  return c - 'a' + 10;
}
229
Steve Naroff451d8f162007-03-12 23:22:38 +0000230bool NumericLiteralParser::GetIntegerValue(uintmax_t &val) {
Steve Narofff2fb89e2007-03-13 20:29:44 +0000231 uintmax_t max_value = UINTMAX_MAX / radix;
Chris Lattner5b743d32007-04-04 05:52:58 +0000232 unsigned max_digit = UINTMAX_MAX % radix;
Steve Naroff09ef4742007-03-09 23:16:33 +0000233
234 val = 0;
Steve Naroff451d8f162007-03-12 23:22:38 +0000235 s = DigitsBegin;
236 while (s < SuffixBegin) {
Chris Lattner5b743d32007-04-04 05:52:58 +0000237 unsigned C = HexLetterToVal(*s++);
Steve Naroff451d8f162007-03-12 23:22:38 +0000238
Chris Lattner5b743d32007-04-04 05:52:58 +0000239 if (val > max_value || (val == max_value && C > max_digit)) {
Steve Naroff451d8f162007-03-12 23:22:38 +0000240 return false; // Overflow!
241 } else {
242 val *= radix;
Chris Lattner5b743d32007-04-04 05:52:58 +0000243 val += C;
Steve Naroff451d8f162007-03-12 23:22:38 +0000244 }
245 }
246 return true;
247}
248
249bool NumericLiteralParser::GetIntegerValue(int &val) {
Steve Narofff2fb89e2007-03-13 20:29:44 +0000250 intmax_t max_value = INT_MAX / radix;
Chris Lattner5b743d32007-04-04 05:52:58 +0000251 unsigned max_digit = INT_MAX % radix;
Steve Naroff451d8f162007-03-12 23:22:38 +0000252
253 val = 0;
Steve Naroff09ef4742007-03-09 23:16:33 +0000254 s = DigitsBegin;
255 while (s < SuffixBegin) {
Chris Lattner5b743d32007-04-04 05:52:58 +0000256 unsigned C = HexLetterToVal(*s++);
Steve Naroff09ef4742007-03-09 23:16:33 +0000257
Chris Lattner5b743d32007-04-04 05:52:58 +0000258 if (val > max_value || (val == max_value && C > max_digit)) {
Steve Naroff09ef4742007-03-09 23:16:33 +0000259 return false; // Overflow!
260 } else {
261 val *= radix;
Chris Lattner5b743d32007-04-04 05:52:58 +0000262 val += C;
Steve Naroff09ef4742007-03-09 23:16:33 +0000263 }
264 }
265 return true;
266}
Steve Narofff2fb89e2007-03-13 20:29:44 +0000267
Chris Lattner5b743d32007-04-04 05:52:58 +0000268/// GetIntegerValue - Convert this numeric literal value to an APInt that
Chris Lattner871b4e12007-04-04 06:36:34 +0000269/// matches Val's input width. If there is an overflow, set Val to the low bits
270/// of the result and return true. Otherwise, return false.
Chris Lattner5b743d32007-04-04 05:52:58 +0000271bool NumericLiteralParser::GetIntegerValue(APInt &Val) {
272 Val = 0;
273 s = DigitsBegin;
274
Chris Lattner5b743d32007-04-04 05:52:58 +0000275 APInt RadixVal(Val.getBitWidth(), radix);
276 APInt CharVal(Val.getBitWidth(), 0);
277 APInt OldVal = Val;
Chris Lattner871b4e12007-04-04 06:36:34 +0000278
279 bool OverflowOccurred = false;
Chris Lattner5b743d32007-04-04 05:52:58 +0000280 while (s < SuffixBegin) {
281 unsigned C = HexLetterToVal(*s++);
282
283 // If this letter is out of bound for this radix, reject it.
Chris Lattner871b4e12007-04-04 06:36:34 +0000284 if (C >= radix) {
285 // FIXME: This is an error, not a warning. This should be caught by
286 // NumericLiteralParser ctor.
287 C = C % radix;
288 OverflowOccurred = true;
289 }
Chris Lattner5b743d32007-04-04 05:52:58 +0000290
291 CharVal = C;
292
Chris Lattner871b4e12007-04-04 06:36:34 +0000293 // Add the digit to the value in the appropriate radix. If adding in digits
294 // made the value smaller, then this overflowed.
Chris Lattner5b743d32007-04-04 05:52:58 +0000295 OldVal = Val;
Chris Lattner871b4e12007-04-04 06:36:34 +0000296
297 // Multiply by radix, did overflow occur on the multiply?
Chris Lattner5b743d32007-04-04 05:52:58 +0000298 Val *= RadixVal;
Chris Lattner871b4e12007-04-04 06:36:34 +0000299 OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
300
301 OldVal = Val;
302 // Add value, did overflow occur on the value?
Chris Lattner5b743d32007-04-04 05:52:58 +0000303 Val += CharVal;
Chris Lattner871b4e12007-04-04 06:36:34 +0000304 OverflowOccurred |= Val.ult(OldVal);
305 OverflowOccurred |= Val.ult(CharVal);
Chris Lattner5b743d32007-04-04 05:52:58 +0000306 }
Chris Lattner871b4e12007-04-04 06:36:34 +0000307 return OverflowOccurred;
Chris Lattner5b743d32007-04-04 05:52:58 +0000308}
309
310
Steve Narofff2fb89e2007-03-13 20:29:44 +0000311void NumericLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
312 const std::string &M) {
313 PP.Diag(Loc, DiagID, M);
314 hadError = true;
315}
Steve Naroff4f88b312007-03-13 22:37:02 +0000316
317/// string-literal: [C99 6.4.5]
318/// " [s-char-sequence] "
319/// L" [s-char-sequence] "
320/// s-char-sequence:
321/// s-char
322/// s-char-sequence s-char
323/// s-char:
324/// any source character except the double quote ",
325/// backslash \, or newline character
326/// escape-character
327/// universal-character-name
328/// escape-character: [C99 6.4.4.4]
329/// \ escape-code
330/// universal-character-name
331/// escape-code:
332/// character-escape-code
333/// octal-escape-code
334/// hex-escape-code
335/// character-escape-code: one of
336/// n t b r f v a
337/// \ ' " ?
338/// octal-escape-code:
339/// octal-digit
340/// octal-digit octal-digit
341/// octal-digit octal-digit octal-digit
342/// hex-escape-code:
343/// x hex-digit
344/// hex-escape-code hex-digit
345/// universal-character-name:
346/// \u hex-quad
347/// \U hex-quad hex-quad
348/// hex-quad:
349/// hex-digit hex-digit hex-digit hex-digit
350
351StringLiteralParser::
352StringLiteralParser(const LexerToken *StringToks, unsigned NumStringToks,
353 Preprocessor &pp, TargetInfo &t) :
354 PP(pp), Target(t)
355{
356 // Scan all of the string portions, remember the max individual token length,
357 // computing a bound on the concatenated string length, and see whether any
358 // piece is a wide-string. If any of the string portions is a wide-string
359 // literal, the result is a wide-string literal [C99 6.4.5p4].
360 MaxTokenLength = StringToks[0].getLength();
361 SizeBound = StringToks[0].getLength()-2; // -2 for "".
362 AnyWide = StringToks[0].getKind() == tok::wide_string_literal;
363
Steve Narofff1e53692007-03-23 22:27:02 +0000364 hadError = false;
365
Steve Naroff4f88b312007-03-13 22:37:02 +0000366 // The common case is that there is only one string fragment.
367 for (unsigned i = 1; i != NumStringToks; ++i) {
368 // The string could be shorter than this if it needs cleaning, but this is a
369 // reasonable bound, which is all we need.
370 SizeBound += StringToks[i].getLength()-2; // -2 for "".
371
372 // Remember maximum string piece length.
373 if (StringToks[i].getLength() > MaxTokenLength)
374 MaxTokenLength = StringToks[i].getLength();
375
376 // Remember if we see any wide strings.
377 AnyWide |= StringToks[i].getKind() == tok::wide_string_literal;
378 }
379
380
381 // Include space for the null terminator.
382 ++SizeBound;
383
384 // TODO: K&R warning: "traditional C rejects string constant concatenation"
385
386 // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not
387 // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true.
388 wchar_tByteWidth = ~0U;
389 if (AnyWide)
390 wchar_tByteWidth = Target.getWCharWidth(StringToks[0].getLocation());
391
392 // The output buffer size needs to be large enough to hold wide characters.
393 // This is a worst-case assumption which basically corresponds to L"" "long".
394 if (AnyWide)
395 SizeBound *= wchar_tByteWidth;
396
397 // Size the temporary buffer to hold the result string data.
398 ResultBuf.resize(SizeBound);
399
400 // Likewise, but for each string piece.
401 SmallString<512> TokenBuf;
402 TokenBuf.resize(MaxTokenLength);
403
404 // Loop over all the strings, getting their spelling, and expanding them to
405 // wide strings as appropriate.
406 ResultPtr = &ResultBuf[0]; // Next byte to fill in.
407
408 for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
409 const char *ThisTokBuf = &TokenBuf[0];
410 // Get the spelling of the token, which eliminates trigraphs, etc. We know
411 // that ThisTokBuf points to a buffer that is big enough for the whole token
412 // and 'spelled' tokens can only shrink.
413 unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
414 const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote.
415
416 // TODO: Input character set mapping support.
417
418 // Skip L marker for wide strings.
419 if (ThisTokBuf[0] == 'L') ++ThisTokBuf;
420
421 assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
422 ++ThisTokBuf;
423
424 while (ThisTokBuf != ThisTokEnd) {
425 // Is this a span of non-escape characters?
426 if (ThisTokBuf[0] != '\\') {
427 const char *InStart = ThisTokBuf;
428 do {
429 ++ThisTokBuf;
430 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
431
432 // Copy the character span over.
433 unsigned Len = ThisTokBuf-InStart;
434 if (!AnyWide) {
435 memcpy(ResultPtr, InStart, Len);
436 ResultPtr += Len;
437 } else {
438 // Note: our internal rep of wide char tokens is always little-endian.
439 for (; Len; --Len, ++InStart) {
440 *ResultPtr++ = InStart[0];
441 // Add zeros at the end.
442 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
443 *ResultPtr++ = 0;
444 }
445 }
446 continue;
447 }
448
449 // Otherwise, this is an escape character. Skip the '\' char.
450 ++ThisTokBuf;
451
452 // We know that this character can't be off the end of the buffer, because
453 // that would have been \", which would not have been the end of string.
454 unsigned ResultChar = *ThisTokBuf++;
455 switch (ResultChar) {
456 // These map to themselves.
457 case '\\': case '\'': case '"': case '?': break;
458
459 // These have fixed mappings.
460 case 'a':
461 // TODO: K&R: the meaning of '\\a' is different in traditional C
462 ResultChar = 7;
463 break;
464 case 'b':
465 ResultChar = 8;
466 break;
467 case 'e':
468 Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape, "e");
469 ResultChar = 27;
470 break;
471 case 'f':
472 ResultChar = 12;
473 break;
474 case 'n':
475 ResultChar = 10;
476 break;
477 case 'r':
478 ResultChar = 13;
479 break;
480 case 't':
481 ResultChar = 9;
482 break;
483 case 'v':
484 ResultChar = 11;
485 break;
486
487 //case 'u': case 'U': // FIXME: UCNs.
488 case 'x': // Hex escape.
489 if (ThisTokBuf == ThisTokEnd ||
490 (ResultChar = HexDigitValue(*ThisTokBuf)) == ~0U) {
491 Diag(StringToks[i].getLocation(), diag::err_hex_escape_no_digits);
492 ResultChar = 0;
493 break;
494 }
495 ++ThisTokBuf; // Consumed one hex digit.
496
497 assert(0 && "hex escape: unimp!");
498 break;
499 case '0': case '1': case '2': case '3':
500 case '4': case '5': case '6': case '7':
501 // Octal escapes.
502 assert(0 && "octal escape: unimp!");
503 break;
504
505 // Otherwise, these are not valid escapes.
506 case '(': case '{': case '[': case '%':
507 // GCC accepts these as extensions. We warn about them as such though.
508 if (!PP.getLangOptions().NoExtensions) {
509 Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape,
510 std::string()+(char)ResultChar);
511 break;
512 }
513 // FALL THROUGH.
514 default:
515 if (isgraph(ThisTokBuf[0])) {
516 Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
517 std::string()+(char)ResultChar);
518 } else {
519 Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
520 "x"+utohexstr(ResultChar));
521 }
522 }
523
524 // Note: our internal rep of wide char tokens is always little-endian.
525 *ResultPtr++ = ResultChar & 0xFF;
526
527 if (AnyWide) {
528 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
529 *ResultPtr++ = ResultChar >> i*8;
530 }
531 }
532 }
533
534 // Add zero terminator.
535 *ResultPtr = 0;
536 if (AnyWide) {
537 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
538 *ResultPtr++ = 0;
539 }
540}
541
542void StringLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
543 const std::string &M) {
544 PP.Diag(Loc, DiagID, M);
545 hadError = true;
546}
547