blob: 286bcf6a3b09971a9a7cb931d7c6b4e84c81e9f2 [file] [log] [blame]
Steve Naroff09ef4742007-03-09 23:16:33 +00001//===--- LiteralSupport.cpp - Code to parse and process literals-*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file was developed by Steve Naroff and is distributed under
6// the University of Illinois Open Source License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements the NumericLiteralParser and StringLiteralParser
10// interfaces.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/Lex/LiteralSupport.h"
15#include "clang/Lex/Preprocessor.h"
16#include "clang/Basic/TargetInfo.h"
17#include "clang/Basic/Diagnostic.h"
Chris Lattner5b743d32007-04-04 05:52:58 +000018#include "llvm/ADT/APInt.h"
Steve Naroff4f88b312007-03-13 22:37:02 +000019#include "llvm/ADT/StringExtras.h"
Steve Naroff09ef4742007-03-09 23:16:33 +000020using namespace llvm;
21using namespace clang;
22
23/// integer-constant: [C99 6.4.4.1]
24/// decimal-constant integer-suffix
25/// octal-constant integer-suffix
26/// hexadecimal-constant integer-suffix
27/// decimal-constant:
28/// nonzero-digit
29/// decimal-constant digit
30/// octal-constant:
31/// 0
32/// octal-constant octal-digit
33/// hexadecimal-constant:
34/// hexadecimal-prefix hexadecimal-digit
35/// hexadecimal-constant hexadecimal-digit
36/// hexadecimal-prefix: one of
37/// 0x 0X
38/// integer-suffix:
39/// unsigned-suffix [long-suffix]
40/// unsigned-suffix [long-long-suffix]
41/// long-suffix [unsigned-suffix]
42/// long-long-suffix [unsigned-suffix]
43/// nonzero-digit:
44/// 1 2 3 4 5 6 7 8 9
45/// octal-digit:
46/// 0 1 2 3 4 5 6 7
47/// hexadecimal-digit:
48/// 0 1 2 3 4 5 6 7 8 9
49/// a b c d e f
50/// A B C D E F
51/// unsigned-suffix: one of
52/// u U
53/// long-suffix: one of
54/// l L
55/// long-long-suffix: one of
56/// ll LL
57///
58/// floating-constant: [C99 6.4.4.2]
59/// TODO: add rules...
60///
61
62NumericLiteralParser::
63NumericLiteralParser(const char *begin, const char *end,
Steve Naroff451d8f162007-03-12 23:22:38 +000064 SourceLocation TokLoc, Preprocessor &pp) :
65 PP(pp), ThisTokBegin(begin), ThisTokEnd(end)
Steve Naroff09ef4742007-03-09 23:16:33 +000066{
67 s = DigitsBegin = begin;
68 saw_exponent = false;
69 saw_period = false;
70 saw_float_suffix = false;
71 isLong = false;
72 isUnsigned = false;
73 isLongLong = false;
74 hadError = false;
75
76 if (*s == '0') { // parse radix
77 s++;
78 if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
79 s++;
80 radix = 16;
81 DigitsBegin = s;
82 s = SkipHexDigits(s);
83 if (s == ThisTokEnd) {
84 } else if (*s == '.') {
85 s++;
86 saw_period = true;
87 s = SkipHexDigits(s);
88 }
89 // A binary exponent can appear with or with a '.'. If dotted, the
90 // binary exponent is required.
91 if (*s == 'p' || *s == 'P') {
92 s++;
93 saw_exponent = true;
94 if (*s == '+' || *s == '-') s++; // sign
95 const char *first_non_digit = SkipDigits(s);
96 if (first_non_digit == s) {
97 Diag(TokLoc, diag::err_exponent_has_no_digits);
98 return;
99 } else {
100 s = first_non_digit;
101 }
102 } else if (saw_period) {
103 Diag(TokLoc, diag::err_hexconstant_requires_exponent);
104 return;
105 }
106 } else {
107 // For now, the radix is set to 8. If we discover that we have a
108 // floating point constant, the radix will change to 10. Octal floating
109 // point constants are not permitted (only decimal and hexadecimal).
110 radix = 8;
111 DigitsBegin = s;
112 s = SkipOctalDigits(s);
113 if (s == ThisTokEnd) {
114 } else if (*s == '.') {
115 s++;
116 radix = 10;
117 saw_period = true;
118 s = SkipDigits(s);
119 }
120 if (*s == 'e' || *s == 'E') { // exponent
121 s++;
122 radix = 10;
123 saw_exponent = true;
124 if (*s == '+' || *s == '-') s++; // sign
125 const char *first_non_digit = SkipDigits(s);
126 if (first_non_digit == s) {
127 Diag(TokLoc, diag::err_exponent_has_no_digits);
128 return;
129 } else {
130 s = first_non_digit;
131 }
132 }
133 }
134 } else { // the first digit is non-zero
135 radix = 10;
136 s = SkipDigits(s);
137 if (s == ThisTokEnd) {
138 } else if (*s == '.') {
139 s++;
140 saw_period = true;
141 s = SkipDigits(s);
142 }
143 if (*s == 'e' || *s == 'E') { // exponent
144 s++;
145 saw_exponent = true;
146 if (*s == '+' || *s == '-') s++; // sign
147 const char *first_non_digit = SkipDigits(s);
148 if (first_non_digit == s) {
149 Diag(TokLoc, diag::err_exponent_has_no_digits);
150 return;
151 } else {
152 s = first_non_digit;
153 }
154 }
155 }
156
157 SuffixBegin = s;
158
159 if (saw_period || saw_exponent) {
160 if (s < ThisTokEnd) { // parse size suffix (float, long double)
161 if (*s == 'f' || *s == 'F') {
162 saw_float_suffix = true;
163 s++;
164 } else if (*s == 'l' || *s == 'L') {
165 isLong = true;
166 s++;
167 }
168 if (s != ThisTokEnd) {
169 Diag(TokLoc, diag::err_invalid_suffix_float_constant,
170 std::string(SuffixBegin, ThisTokEnd));
171 return;
172 }
173 }
174 } else {
175 if (s < ThisTokEnd) {
176 // parse int suffix - they can appear in any order ("ul", "lu", "llu").
177 if (*s == 'u' || *s == 'U') {
178 s++;
179 isUnsigned = true; // unsigned
180
181 if ((s < ThisTokEnd) && (*s == 'l' || *s == 'L')) {
182 s++;
183 // handle "long long" type - l's need to be adjacent and same case.
184 if ((s < ThisTokEnd) && (*s == *(s-1))) {
185 isLongLong = true; // unsigned long long
186 s++;
187 } else {
188 isLong = true; // unsigned long
189 }
190 }
191 } else if (*s == 'l' || *s == 'L') {
192 s++;
193 // handle "long long" types - l's need to be adjacent and same case.
194 if ((s < ThisTokEnd) && (*s == *(s-1))) {
195 s++;
196 if ((s < ThisTokEnd) && (*s == 'u' || *s == 'U')) {
197 isUnsigned = true; // unsigned long long
198 s++;
199 } else {
200 isLongLong = true; // long long
201 }
202 } else { // handle "long" types
203 if ((s < ThisTokEnd) && (*s == 'u' || *s == 'U')) {
204 isUnsigned = true; // unsigned long
205 s++;
206 } else {
207 isLong = true; // long
208 }
209 }
210 }
211 if (s != ThisTokEnd) {
212 Diag(TokLoc, diag::err_invalid_suffix_integer_constant,
213 std::string(SuffixBegin, ThisTokEnd));
214 return;
215 }
216 }
217 }
218}
219
/// HexLetterToVal - Return the numeric value (0-15) of the hexadecimal digit
/// character 'c'.  '0'-'9' map to 0-9, and 'A'-'F' / 'a'-'f' map to 10-15.
/// Any other character is a caller error and trips the assertion in debug
/// builds.
static unsigned HexLetterToVal(char c) {
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;   // letters start at ten, so the offset is added
  assert(c >= 'a' && c <= 'f' && "Lexer scanning error");
  return c - 'a' + 10;
}
229
Steve Naroff451d8f162007-03-12 23:22:38 +0000230bool NumericLiteralParser::GetIntegerValue(uintmax_t &val) {
Steve Narofff2fb89e2007-03-13 20:29:44 +0000231 uintmax_t max_value = UINTMAX_MAX / radix;
Chris Lattner5b743d32007-04-04 05:52:58 +0000232 unsigned max_digit = UINTMAX_MAX % radix;
Steve Naroff09ef4742007-03-09 23:16:33 +0000233
234 val = 0;
Steve Naroff451d8f162007-03-12 23:22:38 +0000235 s = DigitsBegin;
236 while (s < SuffixBegin) {
Chris Lattner5b743d32007-04-04 05:52:58 +0000237 unsigned C = HexLetterToVal(*s++);
Steve Naroff451d8f162007-03-12 23:22:38 +0000238
Chris Lattner5b743d32007-04-04 05:52:58 +0000239 if (val > max_value || (val == max_value && C > max_digit)) {
Steve Naroff451d8f162007-03-12 23:22:38 +0000240 return false; // Overflow!
241 } else {
242 val *= radix;
Chris Lattner5b743d32007-04-04 05:52:58 +0000243 val += C;
Steve Naroff451d8f162007-03-12 23:22:38 +0000244 }
245 }
246 return true;
247}
248
249bool NumericLiteralParser::GetIntegerValue(int &val) {
Steve Narofff2fb89e2007-03-13 20:29:44 +0000250 intmax_t max_value = INT_MAX / radix;
Chris Lattner5b743d32007-04-04 05:52:58 +0000251 unsigned max_digit = INT_MAX % radix;
Steve Naroff451d8f162007-03-12 23:22:38 +0000252
253 val = 0;
Steve Naroff09ef4742007-03-09 23:16:33 +0000254 s = DigitsBegin;
255 while (s < SuffixBegin) {
Chris Lattner5b743d32007-04-04 05:52:58 +0000256 unsigned C = HexLetterToVal(*s++);
Steve Naroff09ef4742007-03-09 23:16:33 +0000257
Chris Lattner5b743d32007-04-04 05:52:58 +0000258 if (val > max_value || (val == max_value && C > max_digit)) {
Steve Naroff09ef4742007-03-09 23:16:33 +0000259 return false; // Overflow!
260 } else {
261 val *= radix;
Chris Lattner5b743d32007-04-04 05:52:58 +0000262 val += C;
Steve Naroff09ef4742007-03-09 23:16:33 +0000263 }
264 }
265 return true;
266}
Steve Narofff2fb89e2007-03-13 20:29:44 +0000267
Chris Lattner5b743d32007-04-04 05:52:58 +0000268/// GetIntegerValue - Convert this numeric literal value to an APInt that
269/// matches Val's input width. If there is an overflow, saturate Val to zero
270/// and return false. Otherwise, set Val and return true.
271bool NumericLiteralParser::GetIntegerValue(APInt &Val) {
272 Val = 0;
273 s = DigitsBegin;
274
275 // FIXME: This doesn't handle sign right, doesn't autopromote to wider
276 // integer, and is generally not conformant.
277 APInt RadixVal(Val.getBitWidth(), radix);
278 APInt CharVal(Val.getBitWidth(), 0);
279 APInt OldVal = Val;
280 while (s < SuffixBegin) {
281 unsigned C = HexLetterToVal(*s++);
282
283 // If this letter is out of bound for this radix, reject it.
284 if (C >= radix) { Val = 0; return false; }
285
286 CharVal = C;
287
288 OldVal = Val;
289 Val *= RadixVal;
290 Val += CharVal;
291 if (OldVal.ugt(Val))
292 return false; // Overflow!
293 }
294 return true;
295}
296
297
Steve Narofff2fb89e2007-03-13 20:29:44 +0000298void NumericLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
299 const std::string &M) {
300 PP.Diag(Loc, DiagID, M);
301 hadError = true;
302}
Steve Naroff4f88b312007-03-13 22:37:02 +0000303
304/// string-literal: [C99 6.4.5]
305/// " [s-char-sequence] "
306/// L" [s-char-sequence] "
307/// s-char-sequence:
308/// s-char
309/// s-char-sequence s-char
310/// s-char:
311/// any source character except the double quote ",
312/// backslash \, or newline character
313/// escape-character
314/// universal-character-name
315/// escape-character: [C99 6.4.4.4]
316/// \ escape-code
317/// universal-character-name
318/// escape-code:
319/// character-escape-code
320/// octal-escape-code
321/// hex-escape-code
322/// character-escape-code: one of
323/// n t b r f v a
324/// \ ' " ?
325/// octal-escape-code:
326/// octal-digit
327/// octal-digit octal-digit
328/// octal-digit octal-digit octal-digit
329/// hex-escape-code:
330/// x hex-digit
331/// hex-escape-code hex-digit
332/// universal-character-name:
333/// \u hex-quad
334/// \U hex-quad hex-quad
335/// hex-quad:
336/// hex-digit hex-digit hex-digit hex-digit
337
338StringLiteralParser::
339StringLiteralParser(const LexerToken *StringToks, unsigned NumStringToks,
340 Preprocessor &pp, TargetInfo &t) :
341 PP(pp), Target(t)
342{
343 // Scan all of the string portions, remember the max individual token length,
344 // computing a bound on the concatenated string length, and see whether any
345 // piece is a wide-string. If any of the string portions is a wide-string
346 // literal, the result is a wide-string literal [C99 6.4.5p4].
347 MaxTokenLength = StringToks[0].getLength();
348 SizeBound = StringToks[0].getLength()-2; // -2 for "".
349 AnyWide = StringToks[0].getKind() == tok::wide_string_literal;
350
Steve Narofff1e53692007-03-23 22:27:02 +0000351 hadError = false;
352
Steve Naroff4f88b312007-03-13 22:37:02 +0000353 // The common case is that there is only one string fragment.
354 for (unsigned i = 1; i != NumStringToks; ++i) {
355 // The string could be shorter than this if it needs cleaning, but this is a
356 // reasonable bound, which is all we need.
357 SizeBound += StringToks[i].getLength()-2; // -2 for "".
358
359 // Remember maximum string piece length.
360 if (StringToks[i].getLength() > MaxTokenLength)
361 MaxTokenLength = StringToks[i].getLength();
362
363 // Remember if we see any wide strings.
364 AnyWide |= StringToks[i].getKind() == tok::wide_string_literal;
365 }
366
367
368 // Include space for the null terminator.
369 ++SizeBound;
370
371 // TODO: K&R warning: "traditional C rejects string constant concatenation"
372
373 // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not
374 // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true.
375 wchar_tByteWidth = ~0U;
376 if (AnyWide)
377 wchar_tByteWidth = Target.getWCharWidth(StringToks[0].getLocation());
378
379 // The output buffer size needs to be large enough to hold wide characters.
380 // This is a worst-case assumption which basically corresponds to L"" "long".
381 if (AnyWide)
382 SizeBound *= wchar_tByteWidth;
383
384 // Size the temporary buffer to hold the result string data.
385 ResultBuf.resize(SizeBound);
386
387 // Likewise, but for each string piece.
388 SmallString<512> TokenBuf;
389 TokenBuf.resize(MaxTokenLength);
390
391 // Loop over all the strings, getting their spelling, and expanding them to
392 // wide strings as appropriate.
393 ResultPtr = &ResultBuf[0]; // Next byte to fill in.
394
395 for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
396 const char *ThisTokBuf = &TokenBuf[0];
397 // Get the spelling of the token, which eliminates trigraphs, etc. We know
398 // that ThisTokBuf points to a buffer that is big enough for the whole token
399 // and 'spelled' tokens can only shrink.
400 unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
401 const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote.
402
403 // TODO: Input character set mapping support.
404
405 // Skip L marker for wide strings.
406 if (ThisTokBuf[0] == 'L') ++ThisTokBuf;
407
408 assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
409 ++ThisTokBuf;
410
411 while (ThisTokBuf != ThisTokEnd) {
412 // Is this a span of non-escape characters?
413 if (ThisTokBuf[0] != '\\') {
414 const char *InStart = ThisTokBuf;
415 do {
416 ++ThisTokBuf;
417 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
418
419 // Copy the character span over.
420 unsigned Len = ThisTokBuf-InStart;
421 if (!AnyWide) {
422 memcpy(ResultPtr, InStart, Len);
423 ResultPtr += Len;
424 } else {
425 // Note: our internal rep of wide char tokens is always little-endian.
426 for (; Len; --Len, ++InStart) {
427 *ResultPtr++ = InStart[0];
428 // Add zeros at the end.
429 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
430 *ResultPtr++ = 0;
431 }
432 }
433 continue;
434 }
435
436 // Otherwise, this is an escape character. Skip the '\' char.
437 ++ThisTokBuf;
438
439 // We know that this character can't be off the end of the buffer, because
440 // that would have been \", which would not have been the end of string.
441 unsigned ResultChar = *ThisTokBuf++;
442 switch (ResultChar) {
443 // These map to themselves.
444 case '\\': case '\'': case '"': case '?': break;
445
446 // These have fixed mappings.
447 case 'a':
448 // TODO: K&R: the meaning of '\\a' is different in traditional C
449 ResultChar = 7;
450 break;
451 case 'b':
452 ResultChar = 8;
453 break;
454 case 'e':
455 Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape, "e");
456 ResultChar = 27;
457 break;
458 case 'f':
459 ResultChar = 12;
460 break;
461 case 'n':
462 ResultChar = 10;
463 break;
464 case 'r':
465 ResultChar = 13;
466 break;
467 case 't':
468 ResultChar = 9;
469 break;
470 case 'v':
471 ResultChar = 11;
472 break;
473
474 //case 'u': case 'U': // FIXME: UCNs.
475 case 'x': // Hex escape.
476 if (ThisTokBuf == ThisTokEnd ||
477 (ResultChar = HexDigitValue(*ThisTokBuf)) == ~0U) {
478 Diag(StringToks[i].getLocation(), diag::err_hex_escape_no_digits);
479 ResultChar = 0;
480 break;
481 }
482 ++ThisTokBuf; // Consumed one hex digit.
483
484 assert(0 && "hex escape: unimp!");
485 break;
486 case '0': case '1': case '2': case '3':
487 case '4': case '5': case '6': case '7':
488 // Octal escapes.
489 assert(0 && "octal escape: unimp!");
490 break;
491
492 // Otherwise, these are not valid escapes.
493 case '(': case '{': case '[': case '%':
494 // GCC accepts these as extensions. We warn about them as such though.
495 if (!PP.getLangOptions().NoExtensions) {
496 Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape,
497 std::string()+(char)ResultChar);
498 break;
499 }
500 // FALL THROUGH.
501 default:
502 if (isgraph(ThisTokBuf[0])) {
503 Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
504 std::string()+(char)ResultChar);
505 } else {
506 Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
507 "x"+utohexstr(ResultChar));
508 }
509 }
510
511 // Note: our internal rep of wide char tokens is always little-endian.
512 *ResultPtr++ = ResultChar & 0xFF;
513
514 if (AnyWide) {
515 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
516 *ResultPtr++ = ResultChar >> i*8;
517 }
518 }
519 }
520
521 // Add zero terminator.
522 *ResultPtr = 0;
523 if (AnyWide) {
524 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
525 *ResultPtr++ = 0;
526 }
527}
528
529void StringLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
530 const std::string &M) {
531 PP.Diag(Loc, DiagID, M);
532 hadError = true;
533}
534