Blame - clang/Lex/LiteralSupport.cpp - toolchain/llvm-project

blob: 286bcf6a3b09971a9a7cb931d7c6b4e84c81e9f2 [file] [log] [blame]

Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	1	//===--- LiteralSupport.cpp - Code to parse and process literals-- C++ --===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file was developed by Steve Naroff and is distributed under
				6	// the University of Illinois Open Source License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// This file implements the NumericLiteralParser and StringLiteralParser interfaces.
				11	//
				12	//===----------------------------------------------------------------------===//
				13
				14	#include "clang/Lex/LiteralSupport.h"
				15	#include "clang/Lex/Preprocessor.h"
				16	#include "clang/Basic/TargetInfo.h"
				17	#include "clang/Basic/Diagnostic.h"
Chris Lattner	5b743d3	2007-04-04 05:52:58 +0000	[diff] [blame^]	18	#include "llvm/ADT/APInt.h"
Steve Naroff	4f88b31	2007-03-13 22:37:02 +0000	[diff] [blame]	19	#include "llvm/ADT/StringExtras.h"
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	20	using namespace llvm;
				21	using namespace clang;
				22
				23	/// integer-constant: [C99 6.4.4.1]
				24	/// decimal-constant integer-suffix
				25	/// octal-constant integer-suffix
				26	/// hexadecimal-constant integer-suffix
				27	/// decimal-constant:
				28	/// nonzero-digit
				29	/// decimal-constant digit
				30	/// octal-constant:
				31	/// 0
				32	/// octal-constant octal-digit
				33	/// hexadecimal-constant:
				34	/// hexadecimal-prefix hexadecimal-digit
				35	/// hexadecimal-constant hexadecimal-digit
				36	/// hexadecimal-prefix: one of
				37	/// 0x 0X
				38	/// integer-suffix:
				39	/// unsigned-suffix [long-suffix]
				40	/// unsigned-suffix [long-long-suffix]
				41	/// long-suffix [unsigned-suffix]
				42	/// long-long-suffix [unsigned-suffix]
				43	/// nonzero-digit:
				44	/// 1 2 3 4 5 6 7 8 9
				45	/// octal-digit:
				46	/// 0 1 2 3 4 5 6 7
				47	/// hexadecimal-digit:
				48	/// 0 1 2 3 4 5 6 7 8 9
				49	/// a b c d e f
				50	/// A B C D E F
				51	/// unsigned-suffix: one of
				52	/// u U
				53	/// long-suffix: one of
				54	/// l L
				55	/// long-long-suffix: one of
				56	/// ll LL
				57	///
				58	/// floating-constant: [C99 6.4.4.2]
				59	/// TODO: add rules...
				60	///
				61
				62	NumericLiteralParser::
				63	NumericLiteralParser(const char begin, const char end,
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	64	SourceLocation TokLoc, Preprocessor &pp) :
				65	PP(pp), ThisTokBegin(begin), ThisTokEnd(end)
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	66	{
				67	s = DigitsBegin = begin;
				68	saw_exponent = false;
				69	saw_period = false;
				70	saw_float_suffix = false;
				71	isLong = false;
				72	isUnsigned = false;
				73	isLongLong = false;
				74	hadError = false;
				75
				76	if (*s == '0') { // parse radix
				77	s++;
				78	if ((s == 'x' \|\| s == 'X') && (isxdigit(s[1]) \|\| s[1] == '.')) {
				79	s++;
				80	radix = 16;
				81	DigitsBegin = s;
				82	s = SkipHexDigits(s);
				83	if (s == ThisTokEnd) {
				84	} else if (*s == '.') {
				85	s++;
				86	saw_period = true;
				87	s = SkipHexDigits(s);
				88	}
				89	// A binary exponent can appear with or with a '.'. If dotted, the
				90	// binary exponent is required.
				91	if (s == 'p' \|\| s == 'P') {
				92	s++;
				93	saw_exponent = true;
				94	if (s == '+' \|\| s == '-') s++; // sign
				95	const char *first_non_digit = SkipDigits(s);
				96	if (first_non_digit == s) {
				97	Diag(TokLoc, diag::err_exponent_has_no_digits);
				98	return;
				99	} else {
				100	s = first_non_digit;
				101	}
				102	} else if (saw_period) {
				103	Diag(TokLoc, diag::err_hexconstant_requires_exponent);
				104	return;
				105	}
				106	} else {
				107	// For now, the radix is set to 8. If we discover that we have a
				108	// floating point constant, the radix will change to 10. Octal floating
				109	// point constants are not permitted (only decimal and hexadecimal).
				110	radix = 8;
				111	DigitsBegin = s;
				112	s = SkipOctalDigits(s);
				113	if (s == ThisTokEnd) {
				114	} else if (*s == '.') {
				115	s++;
				116	radix = 10;
				117	saw_period = true;
				118	s = SkipDigits(s);
				119	}
				120	if (s == 'e' \|\| s == 'E') { // exponent
				121	s++;
				122	radix = 10;
				123	saw_exponent = true;
				124	if (s == '+' \|\| s == '-') s++; // sign
				125	const char *first_non_digit = SkipDigits(s);
				126	if (first_non_digit == s) {
				127	Diag(TokLoc, diag::err_exponent_has_no_digits);
				128	return;
				129	} else {
				130	s = first_non_digit;
				131	}
				132	}
				133	}
				134	} else { // the first digit is non-zero
				135	radix = 10;
				136	s = SkipDigits(s);
				137	if (s == ThisTokEnd) {
				138	} else if (*s == '.') {
				139	s++;
				140	saw_period = true;
				141	s = SkipDigits(s);
				142	}
				143	if (s == 'e' \|\| s == 'E') { // exponent
				144	s++;
				145	saw_exponent = true;
				146	if (s == '+' \|\| s == '-') s++; // sign
				147	const char *first_non_digit = SkipDigits(s);
				148	if (first_non_digit == s) {
				149	Diag(TokLoc, diag::err_exponent_has_no_digits);
				150	return;
				151	} else {
				152	s = first_non_digit;
				153	}
				154	}
				155	}
				156
				157	SuffixBegin = s;
				158
				159	if (saw_period \|\| saw_exponent) {
				160	if (s < ThisTokEnd) { // parse size suffix (float, long double)
				161	if (s == 'f' \|\| s == 'F') {
				162	saw_float_suffix = true;
				163	s++;
				164	} else if (s == 'l' \|\| s == 'L') {
				165	isLong = true;
				166	s++;
				167	}
				168	if (s != ThisTokEnd) {
				169	Diag(TokLoc, diag::err_invalid_suffix_float_constant,
				170	std::string(SuffixBegin, ThisTokEnd));
				171	return;
				172	}
				173	}
				174	} else {
				175	if (s < ThisTokEnd) {
				176	// parse int suffix - they can appear in any order ("ul", "lu", "llu").
				177	if (s == 'u' \|\| s == 'U') {
				178	s++;
				179	isUnsigned = true; // unsigned
				180
				181	if ((s < ThisTokEnd) && (s == 'l' \|\| s == 'L')) {
				182	s++;
				183	// handle "long long" type - l's need to be adjacent and same case.
				184	if ((s < ThisTokEnd) && (s == (s-1))) {
				185	isLongLong = true; // unsigned long long
				186	s++;
				187	} else {
				188	isLong = true; // unsigned long
				189	}
				190	}
				191	} else if (s == 'l' \|\| s == 'L') {
				192	s++;
				193	// handle "long long" types - l's need to be adjacent and same case.
				194	if ((s < ThisTokEnd) && (s == (s-1))) {
				195	s++;
				196	if ((s < ThisTokEnd) && (s == 'u' \|\| s == 'U')) {
				197	isUnsigned = true; // unsigned long long
				198	s++;
				199	} else {
				200	isLongLong = true; // long long
				201	}
				202	} else { // handle "long" types
				203	if ((s < ThisTokEnd) && (s == 'u' \|\| s == 'U')) {
				204	isUnsigned = true; // unsigned long
				205	s++;
				206	} else {
				207	isLong = true; // long
				208	}
				209	}
				210	}
				211	if (s != ThisTokEnd) {
				212	Diag(TokLoc, diag::err_invalid_suffix_integer_constant,
				213	std::string(SuffixBegin, ThisTokEnd));
				214	return;
				215	}
				216	}
				217	}
				218	}
				219
/// HexLetterToVal - Return the numeric value of the digit character 'c':
/// '0'-'9' map to 0-9, and 'A'-'F' / 'a'-'f' map to 10-15.  Any other
/// character is a scanning error and trips the assertion.
static unsigned HexLetterToVal(char c) {
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  assert(c >= 'a' && c <= 'f' && "Lexer scanning error");
  return c - 'a' + 10;
}
				229
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	230	bool NumericLiteralParser::GetIntegerValue(uintmax_t &val) {
Steve Naroff	f2fb89e	2007-03-13 20:29:44 +0000	[diff] [blame]	231	uintmax_t max_value = UINTMAX_MAX / radix;
Chris Lattner	5b743d3	2007-04-04 05:52:58 +0000	[diff] [blame^]	232	unsigned max_digit = UINTMAX_MAX % radix;
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	233
				234	val = 0;
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	235	s = DigitsBegin;
				236	while (s < SuffixBegin) {
Chris Lattner	5b743d3	2007-04-04 05:52:58 +0000	[diff] [blame^]	237	unsigned C = HexLetterToVal(*s++);
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	238
Chris Lattner	5b743d3	2007-04-04 05:52:58 +0000	[diff] [blame^]	239	if (val > max_value \|\| (val == max_value && C > max_digit)) {
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	240	return false; // Overflow!
				241	} else {
				242	val *= radix;
Chris Lattner	5b743d3	2007-04-04 05:52:58 +0000	[diff] [blame^]	243	val += C;
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	244	}
				245	}
				246	return true;
				247	}
				248
				249	bool NumericLiteralParser::GetIntegerValue(int &val) {
Steve Naroff	f2fb89e	2007-03-13 20:29:44 +0000	[diff] [blame]	250	intmax_t max_value = INT_MAX / radix;
Chris Lattner	5b743d3	2007-04-04 05:52:58 +0000	[diff] [blame^]	251	unsigned max_digit = INT_MAX % radix;
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	252
				253	val = 0;
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	254	s = DigitsBegin;
				255	while (s < SuffixBegin) {
Chris Lattner	5b743d3	2007-04-04 05:52:58 +0000	[diff] [blame^]	256	unsigned C = HexLetterToVal(*s++);
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	257
Chris Lattner	5b743d3	2007-04-04 05:52:58 +0000	[diff] [blame^]	258	if (val > max_value \|\| (val == max_value && C > max_digit)) {
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	259	return false; // Overflow!
				260	} else {
				261	val *= radix;
Chris Lattner	5b743d3	2007-04-04 05:52:58 +0000	[diff] [blame^]	262	val += C;
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	263	}
				264	}
				265	return true;
				266	}
Steve Naroff	f2fb89e	2007-03-13 20:29:44 +0000	[diff] [blame]	267
Chris Lattner	5b743d3	2007-04-04 05:52:58 +0000	[diff] [blame^]	268	/// GetIntegerValue - Convert this numeric literal value to an APInt that
				269	/// matches Val's input width. If there is an overflow, saturate Val to zero
				270	/// and return false. Otherwise, set Val and return true.
				271	bool NumericLiteralParser::GetIntegerValue(APInt &Val) {
				272	Val = 0;
				273	s = DigitsBegin;
				274
				275	// FIXME: This doesn't handle sign right, doesn't autopromote to wider
				276	// integer, and is generally not conformant.
				277	APInt RadixVal(Val.getBitWidth(), radix);
				278	APInt CharVal(Val.getBitWidth(), 0);
				279	APInt OldVal = Val;
				280	while (s < SuffixBegin) {
				281	unsigned C = HexLetterToVal(*s++);
				282
				283	// If this letter is out of bound for this radix, reject it.
				284	if (C >= radix) { Val = 0; return false; }
				285
				286	CharVal = C;
				287
				288	OldVal = Val;
				289	Val *= RadixVal;
				290	Val += CharVal;
				291	if (OldVal.ugt(Val))
				292	return false; // Overflow!
				293	}
				294	return true;
				295	}
				296
				297
Steve Naroff	f2fb89e	2007-03-13 20:29:44 +0000	[diff] [blame]	298	void NumericLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
				299	const std::string &M) {
				300	PP.Diag(Loc, DiagID, M);
				301	hadError = true;
				302	}
Steve Naroff	4f88b31	2007-03-13 22:37:02 +0000	[diff] [blame]	303
				304	/// string-literal: [C99 6.4.5]
				305	/// " [s-char-sequence] "
				306	/// L" [s-char-sequence] "
				307	/// s-char-sequence:
				308	/// s-char
				309	/// s-char-sequence s-char
				310	/// s-char:
				311	/// any source character except the double quote ",
				312	/// backslash \, or newline character
				313	/// escape-character
				314	/// universal-character-name
				315	/// escape-character: [C99 6.4.4.4]
				316	/// \ escape-code
				317	/// universal-character-name
				318	/// escape-code:
				319	/// character-escape-code
				320	/// octal-escape-code
				321	/// hex-escape-code
				322	/// character-escape-code: one of
				323	/// n t b r f v a
				324	/// \ ' " ?
				325	/// octal-escape-code:
				326	/// octal-digit
				327	/// octal-digit octal-digit
				328	/// octal-digit octal-digit octal-digit
				329	/// hex-escape-code:
				330	/// x hex-digit
				331	/// hex-escape-code hex-digit
				332	/// universal-character-name:
				333	/// \u hex-quad
				334	/// \U hex-quad hex-quad
				335	/// hex-quad:
				336	/// hex-digit hex-digit hex-digit hex-digit
				337
				338	StringLiteralParser::
				339	StringLiteralParser(const LexerToken *StringToks, unsigned NumStringToks,
				340	Preprocessor &pp, TargetInfo &t) :
				341	PP(pp), Target(t)
				342	{
				343	// Scan all of the string portions, remember the max individual token length,
				344	// computing a bound on the concatenated string length, and see whether any
				345	// piece is a wide-string. If any of the string portions is a wide-string
				346	// literal, the result is a wide-string literal [C99 6.4.5p4].
				347	MaxTokenLength = StringToks[0].getLength();
				348	SizeBound = StringToks[0].getLength()-2; // -2 for "".
				349	AnyWide = StringToks[0].getKind() == tok::wide_string_literal;
				350
Steve Naroff	f1e5369	2007-03-23 22:27:02 +0000	[diff] [blame]	351	hadError = false;
				352
Steve Naroff	4f88b31	2007-03-13 22:37:02 +0000	[diff] [blame]	353	// The common case is that there is only one string fragment.
				354	for (unsigned i = 1; i != NumStringToks; ++i) {
				355	// The string could be shorter than this if it needs cleaning, but this is a
				356	// reasonable bound, which is all we need.
				357	SizeBound += StringToks[i].getLength()-2; // -2 for "".
				358
				359	// Remember maximum string piece length.
				360	if (StringToks[i].getLength() > MaxTokenLength)
				361	MaxTokenLength = StringToks[i].getLength();
				362
				363	// Remember if we see any wide strings.
				364	AnyWide \|= StringToks[i].getKind() == tok::wide_string_literal;
				365	}
				366
				367
				368	// Include space for the null terminator.
				369	++SizeBound;
				370
				371	// TODO: K&R warning: "traditional C rejects string constant concatenation"
				372
				373	// Get the width in bytes of wchar_t. If no wchar_t strings are used, do not
				374	// query the target. As such, wchar_tByteWidth is only valid if AnyWide=true.
				375	wchar_tByteWidth = ~0U;
				376	if (AnyWide)
				377	wchar_tByteWidth = Target.getWCharWidth(StringToks[0].getLocation());
				378
				379	// The output buffer size needs to be large enough to hold wide characters.
				380	// This is a worst-case assumption which basically corresponds to L"" "long".
				381	if (AnyWide)
				382	SizeBound *= wchar_tByteWidth;
				383
				384	// Size the temporary buffer to hold the result string data.
				385	ResultBuf.resize(SizeBound);
				386
				387	// Likewise, but for each string piece.
				388	SmallString<512> TokenBuf;
				389	TokenBuf.resize(MaxTokenLength);
				390
				391	// Loop over all the strings, getting their spelling, and expanding them to
				392	// wide strings as appropriate.
				393	ResultPtr = &ResultBuf[0]; // Next byte to fill in.
				394
				395	for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
				396	const char *ThisTokBuf = &TokenBuf[0];
				397	// Get the spelling of the token, which eliminates trigraphs, etc. We know
				398	// that ThisTokBuf points to a buffer that is big enough for the whole token
				399	// and 'spelled' tokens can only shrink.
				400	unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
				401	const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote.
				402
				403	// TODO: Input character set mapping support.
				404
				405	// Skip L marker for wide strings.
				406	if (ThisTokBuf[0] == 'L') ++ThisTokBuf;
				407
				408	assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
				409	++ThisTokBuf;
				410
				411	while (ThisTokBuf != ThisTokEnd) {
				412	// Is this a span of non-escape characters?
				413	if (ThisTokBuf[0] != '\\') {
				414	const char *InStart = ThisTokBuf;
				415	do {
				416	++ThisTokBuf;
				417	} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
				418
				419	// Copy the character span over.
				420	unsigned Len = ThisTokBuf-InStart;
				421	if (!AnyWide) {
				422	memcpy(ResultPtr, InStart, Len);
				423	ResultPtr += Len;
				424	} else {
				425	// Note: our internal rep of wide char tokens is always little-endian.
				426	for (; Len; --Len, ++InStart) {
				427	*ResultPtr++ = InStart[0];
				428	// Add zeros at the end.
				429	for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
				430	*ResultPtr++ = 0;
				431	}
				432	}
				433	continue;
				434	}
				435
				436	// Otherwise, this is an escape character. Skip the '\' char.
				437	++ThisTokBuf;
				438
				439	// We know that this character can't be off the end of the buffer, because
				440	// that would have been \", which would not have been the end of string.
				441	unsigned ResultChar = *ThisTokBuf++;
				442	switch (ResultChar) {
				443	// These map to themselves.
				444	case '\\': case '\'': case '"': case '?': break;
				445
				446	// These have fixed mappings.
				447	case 'a':
				448	// TODO: K&R: the meaning of '\\a' is different in traditional C
				449	ResultChar = 7;
				450	break;
				451	case 'b':
				452	ResultChar = 8;
				453	break;
				454	case 'e':
				455	Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape, "e");
				456	ResultChar = 27;
				457	break;
				458	case 'f':
				459	ResultChar = 12;
				460	break;
				461	case 'n':
				462	ResultChar = 10;
				463	break;
				464	case 'r':
				465	ResultChar = 13;
				466	break;
				467	case 't':
				468	ResultChar = 9;
				469	break;
				470	case 'v':
				471	ResultChar = 11;
				472	break;
				473
				474	//case 'u': case 'U': // FIXME: UCNs.
				475	case 'x': // Hex escape.
				476	if (ThisTokBuf == ThisTokEnd \|\|
				477	(ResultChar = HexDigitValue(*ThisTokBuf)) == ~0U) {
				478	Diag(StringToks[i].getLocation(), diag::err_hex_escape_no_digits);
				479	ResultChar = 0;
				480	break;
				481	}
				482	++ThisTokBuf; // Consumed one hex digit.
				483
				484	assert(0 && "hex escape: unimp!");
				485	break;
				486	case '0': case '1': case '2': case '3':
				487	case '4': case '5': case '6': case '7':
				488	// Octal escapes.
				489	assert(0 && "octal escape: unimp!");
				490	break;
				491
				492	// Otherwise, these are not valid escapes.
				493	case '(': case '{': case '[': case '%':
				494	// GCC accepts these as extensions. We warn about them as such though.
				495	if (!PP.getLangOptions().NoExtensions) {
				496	Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape,
				497	std::string()+(char)ResultChar);
				498	break;
				499	}
				500	// FALL THROUGH.
				501	default:
				502	if (isgraph(ThisTokBuf[0])) {
				503	Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
				504	std::string()+(char)ResultChar);
				505	} else {
				506	Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
				507	"x"+utohexstr(ResultChar));
				508	}
				509	}
				510
				511	// Note: our internal rep of wide char tokens is always little-endian.
				512	*ResultPtr++ = ResultChar & 0xFF;
				513
				514	if (AnyWide) {
				515	for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
				516	ResultPtr++ = ResultChar >> i8;
				517	}
				518	}
				519	}
				520
				521	// Add zero terminator.
				522	*ResultPtr = 0;
				523	if (AnyWide) {
				524	for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
				525	*ResultPtr++ = 0;
				526	}
				527	}
				528
				529	void StringLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
				530	const std::string &M) {
				531	PP.Diag(Loc, DiagID, M);
				532	hadError = true;
				533	}
				534