blob: 62d370af2d544c4ee7a13cd81bf32c578b6cff9c [file] [log] [blame]
Steve Naroff09ef4742007-03-09 23:16:33 +00001//===--- LiteralSupport.cpp - Code to parse and process literals-*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file was developed by Steve Naroff and is distributed under
6// the University of Illinois Open Source License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements the NumericLiteralParser interface.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/Lex/LiteralSupport.h"
15#include "clang/Lex/Preprocessor.h"
16#include "clang/Basic/TargetInfo.h"
17#include "clang/Basic/Diagnostic.h"
Steve Naroff4f88b312007-03-13 22:37:02 +000018#include "llvm/ADT/StringExtras.h"
Steve Naroff09ef4742007-03-09 23:16:33 +000019
20using namespace llvm;
21using namespace clang;
22
23/// integer-constant: [C99 6.4.4.1]
24/// decimal-constant integer-suffix
25/// octal-constant integer-suffix
26/// hexadecimal-constant integer-suffix
27/// decimal-constant:
28/// nonzero-digit
29/// decimal-constant digit
30/// octal-constant:
31/// 0
32/// octal-constant octal-digit
33/// hexadecimal-constant:
34/// hexadecimal-prefix hexadecimal-digit
35/// hexadecimal-constant hexadecimal-digit
36/// hexadecimal-prefix: one of
37/// 0x 0X
38/// integer-suffix:
39/// unsigned-suffix [long-suffix]
40/// unsigned-suffix [long-long-suffix]
41/// long-suffix [unsigned-suffix]
///         long-long-suffix [unsigned-suffix]
43/// nonzero-digit:
44/// 1 2 3 4 5 6 7 8 9
45/// octal-digit:
46/// 0 1 2 3 4 5 6 7
47/// hexadecimal-digit:
48/// 0 1 2 3 4 5 6 7 8 9
49/// a b c d e f
50/// A B C D E F
51/// unsigned-suffix: one of
52/// u U
53/// long-suffix: one of
54/// l L
55/// long-long-suffix: one of
56/// ll LL
57///
58/// floating-constant: [C99 6.4.4.2]
59/// TODO: add rules...
60///
61
62NumericLiteralParser::
63NumericLiteralParser(const char *begin, const char *end,
Steve Naroff451d8f162007-03-12 23:22:38 +000064 SourceLocation TokLoc, Preprocessor &pp) :
65 PP(pp), ThisTokBegin(begin), ThisTokEnd(end)
Steve Naroff09ef4742007-03-09 23:16:33 +000066{
67 s = DigitsBegin = begin;
68 saw_exponent = false;
69 saw_period = false;
70 saw_float_suffix = false;
71 isLong = false;
72 isUnsigned = false;
73 isLongLong = false;
74 hadError = false;
75
76 if (*s == '0') { // parse radix
77 s++;
78 if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
79 s++;
80 radix = 16;
81 DigitsBegin = s;
82 s = SkipHexDigits(s);
83 if (s == ThisTokEnd) {
84 } else if (*s == '.') {
85 s++;
86 saw_period = true;
87 s = SkipHexDigits(s);
88 }
89 // A binary exponent can appear with or with a '.'. If dotted, the
90 // binary exponent is required.
91 if (*s == 'p' || *s == 'P') {
92 s++;
93 saw_exponent = true;
94 if (*s == '+' || *s == '-') s++; // sign
95 const char *first_non_digit = SkipDigits(s);
96 if (first_non_digit == s) {
97 Diag(TokLoc, diag::err_exponent_has_no_digits);
98 return;
99 } else {
100 s = first_non_digit;
101 }
102 } else if (saw_period) {
103 Diag(TokLoc, diag::err_hexconstant_requires_exponent);
104 return;
105 }
106 } else {
107 // For now, the radix is set to 8. If we discover that we have a
108 // floating point constant, the radix will change to 10. Octal floating
109 // point constants are not permitted (only decimal and hexadecimal).
110 radix = 8;
111 DigitsBegin = s;
112 s = SkipOctalDigits(s);
113 if (s == ThisTokEnd) {
114 } else if (*s == '.') {
115 s++;
116 radix = 10;
117 saw_period = true;
118 s = SkipDigits(s);
119 }
120 if (*s == 'e' || *s == 'E') { // exponent
121 s++;
122 radix = 10;
123 saw_exponent = true;
124 if (*s == '+' || *s == '-') s++; // sign
125 const char *first_non_digit = SkipDigits(s);
126 if (first_non_digit == s) {
127 Diag(TokLoc, diag::err_exponent_has_no_digits);
128 return;
129 } else {
130 s = first_non_digit;
131 }
132 }
133 }
134 } else { // the first digit is non-zero
135 radix = 10;
136 s = SkipDigits(s);
137 if (s == ThisTokEnd) {
138 } else if (*s == '.') {
139 s++;
140 saw_period = true;
141 s = SkipDigits(s);
142 }
143 if (*s == 'e' || *s == 'E') { // exponent
144 s++;
145 saw_exponent = true;
146 if (*s == '+' || *s == '-') s++; // sign
147 const char *first_non_digit = SkipDigits(s);
148 if (first_non_digit == s) {
149 Diag(TokLoc, diag::err_exponent_has_no_digits);
150 return;
151 } else {
152 s = first_non_digit;
153 }
154 }
155 }
156
157 SuffixBegin = s;
158
159 if (saw_period || saw_exponent) {
160 if (s < ThisTokEnd) { // parse size suffix (float, long double)
161 if (*s == 'f' || *s == 'F') {
162 saw_float_suffix = true;
163 s++;
164 } else if (*s == 'l' || *s == 'L') {
165 isLong = true;
166 s++;
167 }
168 if (s != ThisTokEnd) {
169 Diag(TokLoc, diag::err_invalid_suffix_float_constant,
170 std::string(SuffixBegin, ThisTokEnd));
171 return;
172 }
173 }
174 } else {
175 if (s < ThisTokEnd) {
176 // parse int suffix - they can appear in any order ("ul", "lu", "llu").
177 if (*s == 'u' || *s == 'U') {
178 s++;
179 isUnsigned = true; // unsigned
180
181 if ((s < ThisTokEnd) && (*s == 'l' || *s == 'L')) {
182 s++;
183 // handle "long long" type - l's need to be adjacent and same case.
184 if ((s < ThisTokEnd) && (*s == *(s-1))) {
185 isLongLong = true; // unsigned long long
186 s++;
187 } else {
188 isLong = true; // unsigned long
189 }
190 }
191 } else if (*s == 'l' || *s == 'L') {
192 s++;
193 // handle "long long" types - l's need to be adjacent and same case.
194 if ((s < ThisTokEnd) && (*s == *(s-1))) {
195 s++;
196 if ((s < ThisTokEnd) && (*s == 'u' || *s == 'U')) {
197 isUnsigned = true; // unsigned long long
198 s++;
199 } else {
200 isLongLong = true; // long long
201 }
202 } else { // handle "long" types
203 if ((s < ThisTokEnd) && (*s == 'u' || *s == 'U')) {
204 isUnsigned = true; // unsigned long
205 s++;
206 } else {
207 isLong = true; // long
208 }
209 }
210 }
211 if (s != ThisTokEnd) {
212 Diag(TokLoc, diag::err_invalid_suffix_integer_constant,
213 std::string(SuffixBegin, ThisTokEnd));
214 return;
215 }
216 }
217 }
218}
219
Steve Naroff451d8f162007-03-12 23:22:38 +0000220bool NumericLiteralParser::GetIntegerValue(uintmax_t &val) {
Steve Narofff2fb89e2007-03-13 20:29:44 +0000221 uintmax_t max_value = UINTMAX_MAX / radix;
222 int max_digit = UINTMAX_MAX % radix;
Steve Naroff09ef4742007-03-09 23:16:33 +0000223 char c;
224
225 val = 0;
Steve Naroff451d8f162007-03-12 23:22:38 +0000226 s = DigitsBegin;
227 while (s < SuffixBegin) {
228 c = *s++;
229 if (c >= '0' && c <= '9')
230 c -= '0';
Steve Narofff2fb89e2007-03-13 20:29:44 +0000231 else if (c >= 'A' && c <= 'F')
232 c -= 'A' - 10;
233 else if (c >= 'a' && c <= 'f')
Steve Naroff451d8f162007-03-12 23:22:38 +0000234 c -= 'a' - 10;
235
Steve Narofff2fb89e2007-03-13 20:29:44 +0000236 if (val > max_value || (val == max_value && c > max_digit)) {
Steve Naroff451d8f162007-03-12 23:22:38 +0000237 return false; // Overflow!
238 } else {
239 val *= radix;
240 val += c;
241 }
242 }
243 return true;
244}
245
246bool NumericLiteralParser::GetIntegerValue(int &val) {
Steve Narofff2fb89e2007-03-13 20:29:44 +0000247 intmax_t max_value = INT_MAX / radix;
248 int max_digit = INT_MAX % radix;
Steve Naroff451d8f162007-03-12 23:22:38 +0000249 char c;
250
251 val = 0;
Steve Naroff09ef4742007-03-09 23:16:33 +0000252 s = DigitsBegin;
253 while (s < SuffixBegin) {
254 c = *s++;
255 if (c >= '0' && c <= '9')
256 c -= '0';
Steve Narofff2fb89e2007-03-13 20:29:44 +0000257 else if (c >= 'A' && c <= 'F')
Steve Naroff09ef4742007-03-09 23:16:33 +0000258 c -= 'A' - 10;
Steve Narofff2fb89e2007-03-13 20:29:44 +0000259 else if (c >= 'a' && c <= 'f')
Steve Naroff09ef4742007-03-09 23:16:33 +0000260 c -= 'a' - 10;
261
Steve Narofff2fb89e2007-03-13 20:29:44 +0000262 if (val > max_value || (val == max_value && c > max_digit)) {
Steve Naroff09ef4742007-03-09 23:16:33 +0000263 return false; // Overflow!
264 } else {
265 val *= radix;
266 val += c;
267 }
268 }
269 return true;
270}
Steve Narofff2fb89e2007-03-13 20:29:44 +0000271
272void NumericLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
273 const std::string &M) {
274 PP.Diag(Loc, DiagID, M);
275 hadError = true;
276}
Steve Naroff4f88b312007-03-13 22:37:02 +0000277
278/// string-literal: [C99 6.4.5]
279/// " [s-char-sequence] "
280/// L" [s-char-sequence] "
281/// s-char-sequence:
282/// s-char
283/// s-char-sequence s-char
284/// s-char:
285/// any source character except the double quote ",
286/// backslash \, or newline character
287/// escape-character
288/// universal-character-name
289/// escape-character: [C99 6.4.4.4]
290/// \ escape-code
291/// universal-character-name
292/// escape-code:
293/// character-escape-code
294/// octal-escape-code
295/// hex-escape-code
296/// character-escape-code: one of
297/// n t b r f v a
298/// \ ' " ?
299/// octal-escape-code:
300/// octal-digit
301/// octal-digit octal-digit
302/// octal-digit octal-digit octal-digit
303/// hex-escape-code:
304/// x hex-digit
305/// hex-escape-code hex-digit
306/// universal-character-name:
307/// \u hex-quad
308/// \U hex-quad hex-quad
309/// hex-quad:
310/// hex-digit hex-digit hex-digit hex-digit
311
312StringLiteralParser::
313StringLiteralParser(const LexerToken *StringToks, unsigned NumStringToks,
314 Preprocessor &pp, TargetInfo &t) :
315 PP(pp), Target(t)
316{
317 // Scan all of the string portions, remember the max individual token length,
318 // computing a bound on the concatenated string length, and see whether any
319 // piece is a wide-string. If any of the string portions is a wide-string
320 // literal, the result is a wide-string literal [C99 6.4.5p4].
321 MaxTokenLength = StringToks[0].getLength();
322 SizeBound = StringToks[0].getLength()-2; // -2 for "".
323 AnyWide = StringToks[0].getKind() == tok::wide_string_literal;
324
325 // The common case is that there is only one string fragment.
326 for (unsigned i = 1; i != NumStringToks; ++i) {
327 // The string could be shorter than this if it needs cleaning, but this is a
328 // reasonable bound, which is all we need.
329 SizeBound += StringToks[i].getLength()-2; // -2 for "".
330
331 // Remember maximum string piece length.
332 if (StringToks[i].getLength() > MaxTokenLength)
333 MaxTokenLength = StringToks[i].getLength();
334
335 // Remember if we see any wide strings.
336 AnyWide |= StringToks[i].getKind() == tok::wide_string_literal;
337 }
338
339
340 // Include space for the null terminator.
341 ++SizeBound;
342
343 // TODO: K&R warning: "traditional C rejects string constant concatenation"
344
345 // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not
346 // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true.
347 wchar_tByteWidth = ~0U;
348 if (AnyWide)
349 wchar_tByteWidth = Target.getWCharWidth(StringToks[0].getLocation());
350
351 // The output buffer size needs to be large enough to hold wide characters.
352 // This is a worst-case assumption which basically corresponds to L"" "long".
353 if (AnyWide)
354 SizeBound *= wchar_tByteWidth;
355
356 // Size the temporary buffer to hold the result string data.
357 ResultBuf.resize(SizeBound);
358
359 // Likewise, but for each string piece.
360 SmallString<512> TokenBuf;
361 TokenBuf.resize(MaxTokenLength);
362
363 // Loop over all the strings, getting their spelling, and expanding them to
364 // wide strings as appropriate.
365 ResultPtr = &ResultBuf[0]; // Next byte to fill in.
366
367 for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
368 const char *ThisTokBuf = &TokenBuf[0];
369 // Get the spelling of the token, which eliminates trigraphs, etc. We know
370 // that ThisTokBuf points to a buffer that is big enough for the whole token
371 // and 'spelled' tokens can only shrink.
372 unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
373 const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote.
374
375 // TODO: Input character set mapping support.
376
377 // Skip L marker for wide strings.
378 if (ThisTokBuf[0] == 'L') ++ThisTokBuf;
379
380 assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
381 ++ThisTokBuf;
382
383 while (ThisTokBuf != ThisTokEnd) {
384 // Is this a span of non-escape characters?
385 if (ThisTokBuf[0] != '\\') {
386 const char *InStart = ThisTokBuf;
387 do {
388 ++ThisTokBuf;
389 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
390
391 // Copy the character span over.
392 unsigned Len = ThisTokBuf-InStart;
393 if (!AnyWide) {
394 memcpy(ResultPtr, InStart, Len);
395 ResultPtr += Len;
396 } else {
397 // Note: our internal rep of wide char tokens is always little-endian.
398 for (; Len; --Len, ++InStart) {
399 *ResultPtr++ = InStart[0];
400 // Add zeros at the end.
401 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
402 *ResultPtr++ = 0;
403 }
404 }
405 continue;
406 }
407
408 // Otherwise, this is an escape character. Skip the '\' char.
409 ++ThisTokBuf;
410
411 // We know that this character can't be off the end of the buffer, because
412 // that would have been \", which would not have been the end of string.
413 unsigned ResultChar = *ThisTokBuf++;
414 switch (ResultChar) {
415 // These map to themselves.
416 case '\\': case '\'': case '"': case '?': break;
417
418 // These have fixed mappings.
419 case 'a':
420 // TODO: K&R: the meaning of '\\a' is different in traditional C
421 ResultChar = 7;
422 break;
423 case 'b':
424 ResultChar = 8;
425 break;
426 case 'e':
427 Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape, "e");
428 ResultChar = 27;
429 break;
430 case 'f':
431 ResultChar = 12;
432 break;
433 case 'n':
434 ResultChar = 10;
435 break;
436 case 'r':
437 ResultChar = 13;
438 break;
439 case 't':
440 ResultChar = 9;
441 break;
442 case 'v':
443 ResultChar = 11;
444 break;
445
446 //case 'u': case 'U': // FIXME: UCNs.
447 case 'x': // Hex escape.
448 if (ThisTokBuf == ThisTokEnd ||
449 (ResultChar = HexDigitValue(*ThisTokBuf)) == ~0U) {
450 Diag(StringToks[i].getLocation(), diag::err_hex_escape_no_digits);
451 ResultChar = 0;
452 break;
453 }
454 ++ThisTokBuf; // Consumed one hex digit.
455
456 assert(0 && "hex escape: unimp!");
457 break;
458 case '0': case '1': case '2': case '3':
459 case '4': case '5': case '6': case '7':
460 // Octal escapes.
461 assert(0 && "octal escape: unimp!");
462 break;
463
464 // Otherwise, these are not valid escapes.
465 case '(': case '{': case '[': case '%':
466 // GCC accepts these as extensions. We warn about them as such though.
467 if (!PP.getLangOptions().NoExtensions) {
468 Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape,
469 std::string()+(char)ResultChar);
470 break;
471 }
472 // FALL THROUGH.
473 default:
474 if (isgraph(ThisTokBuf[0])) {
475 Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
476 std::string()+(char)ResultChar);
477 } else {
478 Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
479 "x"+utohexstr(ResultChar));
480 }
481 }
482
483 // Note: our internal rep of wide char tokens is always little-endian.
484 *ResultPtr++ = ResultChar & 0xFF;
485
486 if (AnyWide) {
487 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
488 *ResultPtr++ = ResultChar >> i*8;
489 }
490 }
491 }
492
493 // Add zero terminator.
494 *ResultPtr = 0;
495 if (AnyWide) {
496 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
497 *ResultPtr++ = 0;
498 }
499}
500
501void StringLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
502 const std::string &M) {
503 PP.Diag(Loc, DiagID, M);
504 hadError = true;
505}
506