blob: befd3206f3661ecbeac6d71784c88a35276e3187 [file] [log] [blame]
//===--- LiteralSupport.cpp - Code to parse and process literals-*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file was developed by Steve Naroff and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the NumericLiteralParser and StringLiteralParser
// interfaces.
//===----------------------------------------------------------------------===//
13
14#include "clang/Lex/LiteralSupport.h"
15#include "clang/Lex/Preprocessor.h"
16#include "clang/Basic/TargetInfo.h"
17#include "clang/Basic/Diagnostic.h"
Chris Lattner5b743d32007-04-04 05:52:58 +000018#include "llvm/ADT/APInt.h"
Steve Naroff4f88b312007-03-13 22:37:02 +000019#include "llvm/ADT/StringExtras.h"
Steve Naroff09ef4742007-03-09 23:16:33 +000020using namespace llvm;
21using namespace clang;
22
/// integer-constant: [C99 6.4.4.1]
///   decimal-constant integer-suffix
///   octal-constant integer-suffix
///   hexadecimal-constant integer-suffix
/// decimal-constant:
///   nonzero-digit
///   decimal-constant digit
/// octal-constant:
///   0
///   octal-constant octal-digit
/// hexadecimal-constant:
///   hexadecimal-prefix hexadecimal-digit
///   hexadecimal-constant hexadecimal-digit
/// hexadecimal-prefix: one of
///   0x 0X
/// integer-suffix:
///   unsigned-suffix [long-suffix]
///   unsigned-suffix [long-long-suffix]
///   long-suffix [unsigned-suffix]
///   long-long-suffix [unsigned-suffix]
/// nonzero-digit:
///   1 2 3 4 5 6 7 8 9
/// octal-digit:
///   0 1 2 3 4 5 6 7
/// hexadecimal-digit:
///   0 1 2 3 4 5 6 7 8 9
///   a b c d e f
///   A B C D E F
/// unsigned-suffix: one of
///   u U
/// long-suffix: one of
///   l L
/// long-long-suffix: one of
///   ll LL
///
/// floating-constant: [C99 6.4.4.2]
///   TODO: add rules...
///
61
62NumericLiteralParser::
63NumericLiteralParser(const char *begin, const char *end,
Steve Naroff451d8f162007-03-12 23:22:38 +000064 SourceLocation TokLoc, Preprocessor &pp) :
65 PP(pp), ThisTokBegin(begin), ThisTokEnd(end)
Steve Naroff09ef4742007-03-09 23:16:33 +000066{
67 s = DigitsBegin = begin;
68 saw_exponent = false;
69 saw_period = false;
70 saw_float_suffix = false;
71 isLong = false;
72 isUnsigned = false;
73 isLongLong = false;
74 hadError = false;
75
76 if (*s == '0') { // parse radix
77 s++;
78 if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
79 s++;
80 radix = 16;
81 DigitsBegin = s;
82 s = SkipHexDigits(s);
83 if (s == ThisTokEnd) {
84 } else if (*s == '.') {
85 s++;
86 saw_period = true;
87 s = SkipHexDigits(s);
88 }
89 // A binary exponent can appear with or with a '.'. If dotted, the
90 // binary exponent is required.
91 if (*s == 'p' || *s == 'P') {
92 s++;
93 saw_exponent = true;
94 if (*s == '+' || *s == '-') s++; // sign
95 const char *first_non_digit = SkipDigits(s);
96 if (first_non_digit == s) {
97 Diag(TokLoc, diag::err_exponent_has_no_digits);
98 return;
99 } else {
100 s = first_non_digit;
101 }
102 } else if (saw_period) {
103 Diag(TokLoc, diag::err_hexconstant_requires_exponent);
104 return;
105 }
106 } else {
107 // For now, the radix is set to 8. If we discover that we have a
108 // floating point constant, the radix will change to 10. Octal floating
109 // point constants are not permitted (only decimal and hexadecimal).
110 radix = 8;
111 DigitsBegin = s;
112 s = SkipOctalDigits(s);
113 if (s == ThisTokEnd) {
114 } else if (*s == '.') {
115 s++;
116 radix = 10;
117 saw_period = true;
118 s = SkipDigits(s);
119 }
120 if (*s == 'e' || *s == 'E') { // exponent
121 s++;
122 radix = 10;
123 saw_exponent = true;
124 if (*s == '+' || *s == '-') s++; // sign
125 const char *first_non_digit = SkipDigits(s);
126 if (first_non_digit == s) {
127 Diag(TokLoc, diag::err_exponent_has_no_digits);
128 return;
129 } else {
130 s = first_non_digit;
131 }
132 }
133 }
134 } else { // the first digit is non-zero
135 radix = 10;
136 s = SkipDigits(s);
137 if (s == ThisTokEnd) {
138 } else if (*s == '.') {
139 s++;
140 saw_period = true;
141 s = SkipDigits(s);
142 }
143 if (*s == 'e' || *s == 'E') { // exponent
144 s++;
145 saw_exponent = true;
146 if (*s == '+' || *s == '-') s++; // sign
147 const char *first_non_digit = SkipDigits(s);
148 if (first_non_digit == s) {
149 Diag(TokLoc, diag::err_exponent_has_no_digits);
150 return;
151 } else {
152 s = first_non_digit;
153 }
154 }
155 }
156
157 SuffixBegin = s;
158
159 if (saw_period || saw_exponent) {
160 if (s < ThisTokEnd) { // parse size suffix (float, long double)
161 if (*s == 'f' || *s == 'F') {
162 saw_float_suffix = true;
163 s++;
164 } else if (*s == 'l' || *s == 'L') {
165 isLong = true;
166 s++;
167 }
168 if (s != ThisTokEnd) {
169 Diag(TokLoc, diag::err_invalid_suffix_float_constant,
170 std::string(SuffixBegin, ThisTokEnd));
171 return;
172 }
173 }
174 } else {
175 if (s < ThisTokEnd) {
176 // parse int suffix - they can appear in any order ("ul", "lu", "llu").
177 if (*s == 'u' || *s == 'U') {
178 s++;
179 isUnsigned = true; // unsigned
180
181 if ((s < ThisTokEnd) && (*s == 'l' || *s == 'L')) {
182 s++;
183 // handle "long long" type - l's need to be adjacent and same case.
184 if ((s < ThisTokEnd) && (*s == *(s-1))) {
185 isLongLong = true; // unsigned long long
186 s++;
187 } else {
188 isLong = true; // unsigned long
189 }
190 }
191 } else if (*s == 'l' || *s == 'L') {
192 s++;
193 // handle "long long" types - l's need to be adjacent and same case.
194 if ((s < ThisTokEnd) && (*s == *(s-1))) {
195 s++;
196 if ((s < ThisTokEnd) && (*s == 'u' || *s == 'U')) {
197 isUnsigned = true; // unsigned long long
198 s++;
199 } else {
200 isLongLong = true; // long long
201 }
202 } else { // handle "long" types
203 if ((s < ThisTokEnd) && (*s == 'u' || *s == 'U')) {
204 isUnsigned = true; // unsigned long
205 s++;
206 } else {
207 isLong = true; // long
208 }
209 }
210 }
211 if (s != ThisTokEnd) {
212 Diag(TokLoc, diag::err_invalid_suffix_integer_constant,
213 std::string(SuffixBegin, ThisTokEnd));
214 return;
215 }
216 }
217 }
218}
219
/// HexLetterToVal - Return the numeric value (0-15) of the digit character c.
/// Accepts '0'-'9', 'A'-'F' and 'a'-'f'; since decimal and octal digits are a
/// subset of these, the same routine is used for every radix.  Any other
/// character indicates that the caller scanned the literal incorrectly and
/// trips the assertion.
static unsigned HexLetterToVal(char c) {
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;  // 'A' is ten, not minus ten.
  assert(c >= 'a' && c <= 'f' && "Lexer scanning error");
  return c - 'a' + 10;
}
229
Steve Naroff451d8f162007-03-12 23:22:38 +0000230bool NumericLiteralParser::GetIntegerValue(uintmax_t &val) {
Steve Narofff2fb89e2007-03-13 20:29:44 +0000231 uintmax_t max_value = UINTMAX_MAX / radix;
Chris Lattner5b743d32007-04-04 05:52:58 +0000232 unsigned max_digit = UINTMAX_MAX % radix;
Steve Naroff09ef4742007-03-09 23:16:33 +0000233
234 val = 0;
Steve Naroff451d8f162007-03-12 23:22:38 +0000235 s = DigitsBegin;
236 while (s < SuffixBegin) {
Chris Lattner5b743d32007-04-04 05:52:58 +0000237 unsigned C = HexLetterToVal(*s++);
Steve Naroff451d8f162007-03-12 23:22:38 +0000238
Chris Lattner5b743d32007-04-04 05:52:58 +0000239 if (val > max_value || (val == max_value && C > max_digit)) {
Steve Naroff451d8f162007-03-12 23:22:38 +0000240 return false; // Overflow!
241 } else {
242 val *= radix;
Chris Lattner5b743d32007-04-04 05:52:58 +0000243 val += C;
Steve Naroff451d8f162007-03-12 23:22:38 +0000244 }
245 }
246 return true;
247}
248
249bool NumericLiteralParser::GetIntegerValue(int &val) {
Steve Narofff2fb89e2007-03-13 20:29:44 +0000250 intmax_t max_value = INT_MAX / radix;
Chris Lattner5b743d32007-04-04 05:52:58 +0000251 unsigned max_digit = INT_MAX % radix;
Steve Naroff451d8f162007-03-12 23:22:38 +0000252
253 val = 0;
Steve Naroff09ef4742007-03-09 23:16:33 +0000254 s = DigitsBegin;
255 while (s < SuffixBegin) {
Chris Lattner5b743d32007-04-04 05:52:58 +0000256 unsigned C = HexLetterToVal(*s++);
Steve Naroff09ef4742007-03-09 23:16:33 +0000257
Chris Lattner5b743d32007-04-04 05:52:58 +0000258 if (val > max_value || (val == max_value && C > max_digit)) {
Steve Naroff09ef4742007-03-09 23:16:33 +0000259 return false; // Overflow!
260 } else {
261 val *= radix;
Chris Lattner5b743d32007-04-04 05:52:58 +0000262 val += C;
Steve Naroff09ef4742007-03-09 23:16:33 +0000263 }
264 }
265 return true;
266}
Steve Narofff2fb89e2007-03-13 20:29:44 +0000267
Chris Lattner5b743d32007-04-04 05:52:58 +0000268/// GetIntegerValue - Convert this numeric literal value to an APInt that
Chris Lattner871b4e12007-04-04 06:36:34 +0000269/// matches Val's input width. If there is an overflow, set Val to the low bits
270/// of the result and return true. Otherwise, return false.
Chris Lattner5b743d32007-04-04 05:52:58 +0000271bool NumericLiteralParser::GetIntegerValue(APInt &Val) {
272 Val = 0;
273 s = DigitsBegin;
274
Chris Lattner5b743d32007-04-04 05:52:58 +0000275 APInt RadixVal(Val.getBitWidth(), radix);
276 APInt CharVal(Val.getBitWidth(), 0);
277 APInt OldVal = Val;
Chris Lattner871b4e12007-04-04 06:36:34 +0000278
279 bool OverflowOccurred = false;
Chris Lattner5b743d32007-04-04 05:52:58 +0000280 while (s < SuffixBegin) {
281 unsigned C = HexLetterToVal(*s++);
282
283 // If this letter is out of bound for this radix, reject it.
Chris Lattner531efa42007-04-04 06:49:26 +0000284 assert(C < radix && "NumericLiteralParser ctor should have rejected this");
Chris Lattner5b743d32007-04-04 05:52:58 +0000285
286 CharVal = C;
287
Chris Lattner871b4e12007-04-04 06:36:34 +0000288 // Add the digit to the value in the appropriate radix. If adding in digits
289 // made the value smaller, then this overflowed.
Chris Lattner5b743d32007-04-04 05:52:58 +0000290 OldVal = Val;
Chris Lattner871b4e12007-04-04 06:36:34 +0000291
292 // Multiply by radix, did overflow occur on the multiply?
Chris Lattner5b743d32007-04-04 05:52:58 +0000293 Val *= RadixVal;
Chris Lattner871b4e12007-04-04 06:36:34 +0000294 OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
295
296 OldVal = Val;
297 // Add value, did overflow occur on the value?
Chris Lattner5b743d32007-04-04 05:52:58 +0000298 Val += CharVal;
Chris Lattner871b4e12007-04-04 06:36:34 +0000299 OverflowOccurred |= Val.ult(OldVal);
300 OverflowOccurred |= Val.ult(CharVal);
Chris Lattner5b743d32007-04-04 05:52:58 +0000301 }
Chris Lattner871b4e12007-04-04 06:36:34 +0000302 return OverflowOccurred;
Chris Lattner5b743d32007-04-04 05:52:58 +0000303}
304
305
Steve Narofff2fb89e2007-03-13 20:29:44 +0000306void NumericLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
307 const std::string &M) {
308 PP.Diag(Loc, DiagID, M);
309 hadError = true;
310}
Steve Naroff4f88b312007-03-13 22:37:02 +0000311
/// string-literal: [C99 6.4.5]
///   " [s-char-sequence] "
///   L" [s-char-sequence] "
/// s-char-sequence:
///   s-char
///   s-char-sequence s-char
/// s-char:
///   any source character except the double quote ",
///     backslash \, or newline character
///   escape-character
///   universal-character-name
/// escape-character: [C99 6.4.4.4]
///   \ escape-code
///   universal-character-name
/// escape-code:
///   character-escape-code
///   octal-escape-code
///   hex-escape-code
/// character-escape-code: one of
///   n t b r f v a
///   \ ' " ?
/// octal-escape-code:
///   octal-digit
///   octal-digit octal-digit
///   octal-digit octal-digit octal-digit
/// hex-escape-code:
///   x hex-digit
///   hex-escape-code hex-digit
/// universal-character-name:
///   \u hex-quad
///   \U hex-quad hex-quad
/// hex-quad:
///   hex-digit hex-digit hex-digit hex-digit
345
346StringLiteralParser::
347StringLiteralParser(const LexerToken *StringToks, unsigned NumStringToks,
348 Preprocessor &pp, TargetInfo &t) :
349 PP(pp), Target(t)
350{
351 // Scan all of the string portions, remember the max individual token length,
352 // computing a bound on the concatenated string length, and see whether any
353 // piece is a wide-string. If any of the string portions is a wide-string
354 // literal, the result is a wide-string literal [C99 6.4.5p4].
355 MaxTokenLength = StringToks[0].getLength();
356 SizeBound = StringToks[0].getLength()-2; // -2 for "".
357 AnyWide = StringToks[0].getKind() == tok::wide_string_literal;
358
Steve Narofff1e53692007-03-23 22:27:02 +0000359 hadError = false;
360
Steve Naroff4f88b312007-03-13 22:37:02 +0000361 // The common case is that there is only one string fragment.
362 for (unsigned i = 1; i != NumStringToks; ++i) {
363 // The string could be shorter than this if it needs cleaning, but this is a
364 // reasonable bound, which is all we need.
365 SizeBound += StringToks[i].getLength()-2; // -2 for "".
366
367 // Remember maximum string piece length.
368 if (StringToks[i].getLength() > MaxTokenLength)
369 MaxTokenLength = StringToks[i].getLength();
370
371 // Remember if we see any wide strings.
372 AnyWide |= StringToks[i].getKind() == tok::wide_string_literal;
373 }
374
375
376 // Include space for the null terminator.
377 ++SizeBound;
378
379 // TODO: K&R warning: "traditional C rejects string constant concatenation"
380
381 // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not
382 // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true.
383 wchar_tByteWidth = ~0U;
384 if (AnyWide)
385 wchar_tByteWidth = Target.getWCharWidth(StringToks[0].getLocation());
386
387 // The output buffer size needs to be large enough to hold wide characters.
388 // This is a worst-case assumption which basically corresponds to L"" "long".
389 if (AnyWide)
390 SizeBound *= wchar_tByteWidth;
391
392 // Size the temporary buffer to hold the result string data.
393 ResultBuf.resize(SizeBound);
394
395 // Likewise, but for each string piece.
396 SmallString<512> TokenBuf;
397 TokenBuf.resize(MaxTokenLength);
398
399 // Loop over all the strings, getting their spelling, and expanding them to
400 // wide strings as appropriate.
401 ResultPtr = &ResultBuf[0]; // Next byte to fill in.
402
403 for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
404 const char *ThisTokBuf = &TokenBuf[0];
405 // Get the spelling of the token, which eliminates trigraphs, etc. We know
406 // that ThisTokBuf points to a buffer that is big enough for the whole token
407 // and 'spelled' tokens can only shrink.
408 unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
409 const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote.
410
411 // TODO: Input character set mapping support.
412
413 // Skip L marker for wide strings.
414 if (ThisTokBuf[0] == 'L') ++ThisTokBuf;
415
416 assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
417 ++ThisTokBuf;
418
419 while (ThisTokBuf != ThisTokEnd) {
420 // Is this a span of non-escape characters?
421 if (ThisTokBuf[0] != '\\') {
422 const char *InStart = ThisTokBuf;
423 do {
424 ++ThisTokBuf;
425 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
426
427 // Copy the character span over.
428 unsigned Len = ThisTokBuf-InStart;
429 if (!AnyWide) {
430 memcpy(ResultPtr, InStart, Len);
431 ResultPtr += Len;
432 } else {
433 // Note: our internal rep of wide char tokens is always little-endian.
434 for (; Len; --Len, ++InStart) {
435 *ResultPtr++ = InStart[0];
436 // Add zeros at the end.
437 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
438 *ResultPtr++ = 0;
439 }
440 }
441 continue;
442 }
443
444 // Otherwise, this is an escape character. Skip the '\' char.
445 ++ThisTokBuf;
446
447 // We know that this character can't be off the end of the buffer, because
448 // that would have been \", which would not have been the end of string.
449 unsigned ResultChar = *ThisTokBuf++;
450 switch (ResultChar) {
451 // These map to themselves.
452 case '\\': case '\'': case '"': case '?': break;
453
454 // These have fixed mappings.
455 case 'a':
456 // TODO: K&R: the meaning of '\\a' is different in traditional C
457 ResultChar = 7;
458 break;
459 case 'b':
460 ResultChar = 8;
461 break;
462 case 'e':
463 Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape, "e");
464 ResultChar = 27;
465 break;
466 case 'f':
467 ResultChar = 12;
468 break;
469 case 'n':
470 ResultChar = 10;
471 break;
472 case 'r':
473 ResultChar = 13;
474 break;
475 case 't':
476 ResultChar = 9;
477 break;
478 case 'v':
479 ResultChar = 11;
480 break;
481
482 //case 'u': case 'U': // FIXME: UCNs.
483 case 'x': // Hex escape.
484 if (ThisTokBuf == ThisTokEnd ||
485 (ResultChar = HexDigitValue(*ThisTokBuf)) == ~0U) {
486 Diag(StringToks[i].getLocation(), diag::err_hex_escape_no_digits);
487 ResultChar = 0;
488 break;
489 }
490 ++ThisTokBuf; // Consumed one hex digit.
491
492 assert(0 && "hex escape: unimp!");
493 break;
494 case '0': case '1': case '2': case '3':
495 case '4': case '5': case '6': case '7':
496 // Octal escapes.
497 assert(0 && "octal escape: unimp!");
498 break;
499
500 // Otherwise, these are not valid escapes.
501 case '(': case '{': case '[': case '%':
502 // GCC accepts these as extensions. We warn about them as such though.
503 if (!PP.getLangOptions().NoExtensions) {
504 Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape,
505 std::string()+(char)ResultChar);
506 break;
507 }
508 // FALL THROUGH.
509 default:
510 if (isgraph(ThisTokBuf[0])) {
511 Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
512 std::string()+(char)ResultChar);
513 } else {
514 Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
515 "x"+utohexstr(ResultChar));
516 }
517 }
518
519 // Note: our internal rep of wide char tokens is always little-endian.
520 *ResultPtr++ = ResultChar & 0xFF;
521
522 if (AnyWide) {
523 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
524 *ResultPtr++ = ResultChar >> i*8;
525 }
526 }
527 }
528
529 // Add zero terminator.
530 *ResultPtr = 0;
531 if (AnyWide) {
532 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
533 *ResultPtr++ = 0;
534 }
535}
536
537void StringLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
538 const std::string &M) {
539 PP.Diag(Loc, DiagID, M);
540 hadError = true;
541}
542