Blame - clang/Lex/LiteralSupport.cpp - toolchain/llvm-project

blob: 62d370af2d544c4ee7a13cd81bf32c578b6cff9c [file] [log] [blame]

Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	1	//===--- LiteralSupport.cpp - Code to parse and process literals-*- C++ -*-===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file was developed by Steve Naroff and is distributed under
				6	// the University of Illinois Open Source License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// This file implements the NumericLiteralParser and StringLiteralParser
				10	// interfaces.
				11	//
				12	//===----------------------------------------------------------------------===//
				13
				14	#include "clang/Lex/LiteralSupport.h"
				15	#include "clang/Lex/Preprocessor.h"
				16	#include "clang/Basic/TargetInfo.h"
				17	#include "clang/Basic/Diagnostic.h"
Steve Naroff	4f88b31	2007-03-13 22:37:02 +0000	[diff] [blame^]	18	#include "llvm/ADT/StringExtras.h"
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	19
				20	using namespace llvm;
				21	using namespace clang;
				22
				23	/// integer-constant: [C99 6.4.4.1]
				24	/// decimal-constant integer-suffix
				25	/// octal-constant integer-suffix
				26	/// hexadecimal-constant integer-suffix
				27	/// decimal-constant:
				28	/// nonzero-digit
				29	/// decimal-constant digit
				30	/// octal-constant:
				31	/// 0
				32	/// octal-constant octal-digit
				33	/// hexadecimal-constant:
				34	/// hexadecimal-prefix hexadecimal-digit
				35	/// hexadecimal-constant hexadecimal-digit
				36	/// hexadecimal-prefix: one of
				37	/// 0x 0X
				38	/// integer-suffix:
				39	/// unsigned-suffix [long-suffix]
				40	/// unsigned-suffix [long-long-suffix]
				41	/// long-suffix [unsigned-suffix]
				42	/// long-long-suffix [unsigned-suffix]
				43	/// nonzero-digit:
				44	/// 1 2 3 4 5 6 7 8 9
				45	/// octal-digit:
				46	/// 0 1 2 3 4 5 6 7
				47	/// hexadecimal-digit:
				48	/// 0 1 2 3 4 5 6 7 8 9
				49	/// a b c d e f
				50	/// A B C D E F
				51	/// unsigned-suffix: one of
				52	/// u U
				53	/// long-suffix: one of
				54	/// l L
				55	/// long-long-suffix: one of
				56	/// ll LL
				57	///
				58	/// floating-constant: [C99 6.4.4.2]
				59	/// TODO: add rules...
				60	///
				61
				62	NumericLiteralParser::
				63	NumericLiteralParser(const char begin, const char end,
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	64	SourceLocation TokLoc, Preprocessor &pp) :
				65	PP(pp), ThisTokBegin(begin), ThisTokEnd(end)
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	66	{
				67	s = DigitsBegin = begin;
				68	saw_exponent = false;
				69	saw_period = false;
				70	saw_float_suffix = false;
				71	isLong = false;
				72	isUnsigned = false;
				73	isLongLong = false;
				74	hadError = false;
				75
				76	if (*s == '0') { // parse radix
				77	s++;
				78	if ((s == 'x' \|\| s == 'X') && (isxdigit(s[1]) \|\| s[1] == '.')) {
				79	s++;
				80	radix = 16;
				81	DigitsBegin = s;
				82	s = SkipHexDigits(s);
				83	if (s == ThisTokEnd) {
				84	} else if (*s == '.') {
				85	s++;
				86	saw_period = true;
				87	s = SkipHexDigits(s);
				88	}
				89	// A binary exponent can appear with or with a '.'. If dotted, the
				90	// binary exponent is required.
				91	if (s == 'p' \|\| s == 'P') {
				92	s++;
				93	saw_exponent = true;
				94	if (s == '+' \|\| s == '-') s++; // sign
				95	const char *first_non_digit = SkipDigits(s);
				96	if (first_non_digit == s) {
				97	Diag(TokLoc, diag::err_exponent_has_no_digits);
				98	return;
				99	} else {
				100	s = first_non_digit;
				101	}
				102	} else if (saw_period) {
				103	Diag(TokLoc, diag::err_hexconstant_requires_exponent);
				104	return;
				105	}
				106	} else {
				107	// For now, the radix is set to 8. If we discover that we have a
				108	// floating point constant, the radix will change to 10. Octal floating
				109	// point constants are not permitted (only decimal and hexadecimal).
				110	radix = 8;
				111	DigitsBegin = s;
				112	s = SkipOctalDigits(s);
				113	if (s == ThisTokEnd) {
				114	} else if (*s == '.') {
				115	s++;
				116	radix = 10;
				117	saw_period = true;
				118	s = SkipDigits(s);
				119	}
				120	if (s == 'e' \|\| s == 'E') { // exponent
				121	s++;
				122	radix = 10;
				123	saw_exponent = true;
				124	if (s == '+' \|\| s == '-') s++; // sign
				125	const char *first_non_digit = SkipDigits(s);
				126	if (first_non_digit == s) {
				127	Diag(TokLoc, diag::err_exponent_has_no_digits);
				128	return;
				129	} else {
				130	s = first_non_digit;
				131	}
				132	}
				133	}
				134	} else { // the first digit is non-zero
				135	radix = 10;
				136	s = SkipDigits(s);
				137	if (s == ThisTokEnd) {
				138	} else if (*s == '.') {
				139	s++;
				140	saw_period = true;
				141	s = SkipDigits(s);
				142	}
				143	if (s == 'e' \|\| s == 'E') { // exponent
				144	s++;
				145	saw_exponent = true;
				146	if (s == '+' \|\| s == '-') s++; // sign
				147	const char *first_non_digit = SkipDigits(s);
				148	if (first_non_digit == s) {
				149	Diag(TokLoc, diag::err_exponent_has_no_digits);
				150	return;
				151	} else {
				152	s = first_non_digit;
				153	}
				154	}
				155	}
				156
				157	SuffixBegin = s;
				158
				159	if (saw_period \|\| saw_exponent) {
				160	if (s < ThisTokEnd) { // parse size suffix (float, long double)
				161	if (s == 'f' \|\| s == 'F') {
				162	saw_float_suffix = true;
				163	s++;
				164	} else if (s == 'l' \|\| s == 'L') {
				165	isLong = true;
				166	s++;
				167	}
				168	if (s != ThisTokEnd) {
				169	Diag(TokLoc, diag::err_invalid_suffix_float_constant,
				170	std::string(SuffixBegin, ThisTokEnd));
				171	return;
				172	}
				173	}
				174	} else {
				175	if (s < ThisTokEnd) {
				176	// parse int suffix - they can appear in any order ("ul", "lu", "llu").
				177	if (s == 'u' \|\| s == 'U') {
				178	s++;
				179	isUnsigned = true; // unsigned
				180
				181	if ((s < ThisTokEnd) && (s == 'l' \|\| s == 'L')) {
				182	s++;
				183	// handle "long long" type - l's need to be adjacent and same case.
				184	if ((s < ThisTokEnd) && (s == (s-1))) {
				185	isLongLong = true; // unsigned long long
				186	s++;
				187	} else {
				188	isLong = true; // unsigned long
				189	}
				190	}
				191	} else if (s == 'l' \|\| s == 'L') {
				192	s++;
				193	// handle "long long" types - l's need to be adjacent and same case.
				194	if ((s < ThisTokEnd) && (s == (s-1))) {
				195	s++;
				196	if ((s < ThisTokEnd) && (s == 'u' \|\| s == 'U')) {
				197	isUnsigned = true; // unsigned long long
				198	s++;
				199	} else {
				200	isLongLong = true; // long long
				201	}
				202	} else { // handle "long" types
				203	if ((s < ThisTokEnd) && (s == 'u' \|\| s == 'U')) {
				204	isUnsigned = true; // unsigned long
				205	s++;
				206	} else {
				207	isLong = true; // long
				208	}
				209	}
				210	}
				211	if (s != ThisTokEnd) {
				212	Diag(TokLoc, diag::err_invalid_suffix_integer_constant,
				213	std::string(SuffixBegin, ThisTokEnd));
				214	return;
				215	}
				216	}
				217	}
				218	}
				219
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	220	bool NumericLiteralParser::GetIntegerValue(uintmax_t &val) {
Steve Naroff	f2fb89e	2007-03-13 20:29:44 +0000	[diff] [blame]	221	uintmax_t max_value = UINTMAX_MAX / radix;
				222	int max_digit = UINTMAX_MAX % radix;
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	223	char c;
				224
				225	val = 0;
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	226	s = DigitsBegin;
				227	while (s < SuffixBegin) {
				228	c = *s++;
				229	if (c >= '0' && c <= '9')
				230	c -= '0';
Steve Naroff	f2fb89e	2007-03-13 20:29:44 +0000	[diff] [blame]	231	else if (c >= 'A' && c <= 'F')
				232	c -= 'A' - 10;
				233	else if (c >= 'a' && c <= 'f')
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	234	c -= 'a' - 10;
				235
Steve Naroff	f2fb89e	2007-03-13 20:29:44 +0000	[diff] [blame]	236	if (val > max_value \|\| (val == max_value && c > max_digit)) {
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	237	return false; // Overflow!
				238	} else {
				239	val *= radix;
				240	val += c;
				241	}
				242	}
				243	return true;
				244	}
				245
				246	bool NumericLiteralParser::GetIntegerValue(int &val) {
Steve Naroff	f2fb89e	2007-03-13 20:29:44 +0000	[diff] [blame]	247	intmax_t max_value = INT_MAX / radix;
				248	int max_digit = INT_MAX % radix;
Steve Naroff	451d8f16	2007-03-12 23:22:38 +0000	[diff] [blame]	249	char c;
				250
				251	val = 0;
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	252	s = DigitsBegin;
				253	while (s < SuffixBegin) {
				254	c = *s++;
				255	if (c >= '0' && c <= '9')
				256	c -= '0';
Steve Naroff	f2fb89e	2007-03-13 20:29:44 +0000	[diff] [blame]	257	else if (c >= 'A' && c <= 'F')
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	258	c -= 'A' - 10;
Steve Naroff	f2fb89e	2007-03-13 20:29:44 +0000	[diff] [blame]	259	else if (c >= 'a' && c <= 'f')
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	260	c -= 'a' - 10;
				261
Steve Naroff	f2fb89e	2007-03-13 20:29:44 +0000	[diff] [blame]	262	if (val > max_value \|\| (val == max_value && c > max_digit)) {
Steve Naroff	09ef474	2007-03-09 23:16:33 +0000	[diff] [blame]	263	return false; // Overflow!
				264	} else {
				265	val *= radix;
				266	val += c;
				267	}
				268	}
				269	return true;
				270	}
Steve Naroff	f2fb89e	2007-03-13 20:29:44 +0000	[diff] [blame]	271
				272	void NumericLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
				273	const std::string &M) {
				274	PP.Diag(Loc, DiagID, M);
				275	hadError = true;
				276	}
Steve Naroff	4f88b31	2007-03-13 22:37:02 +0000	[diff] [blame^]	277
				278	/// string-literal: [C99 6.4.5]
				279	/// " [s-char-sequence] "
				280	/// L" [s-char-sequence] "
				281	/// s-char-sequence:
				282	/// s-char
				283	/// s-char-sequence s-char
				284	/// s-char:
				285	/// any source character except the double quote ",
				286	/// backslash \, or newline character
				287	/// escape-character
				288	/// universal-character-name
				289	/// escape-character: [C99 6.4.4.4]
				290	/// \ escape-code
				291	/// universal-character-name
				292	/// escape-code:
				293	/// character-escape-code
				294	/// octal-escape-code
				295	/// hex-escape-code
				296	/// character-escape-code: one of
				297	/// n t b r f v a
				298	/// \ ' " ?
				299	/// octal-escape-code:
				300	/// octal-digit
				301	/// octal-digit octal-digit
				302	/// octal-digit octal-digit octal-digit
				303	/// hex-escape-code:
				304	/// x hex-digit
				305	/// hex-escape-code hex-digit
				306	/// universal-character-name:
				307	/// \u hex-quad
				308	/// \U hex-quad hex-quad
				309	/// hex-quad:
				310	/// hex-digit hex-digit hex-digit hex-digit
				311
				312	StringLiteralParser::
				313	StringLiteralParser(const LexerToken *StringToks, unsigned NumStringToks,
				314	Preprocessor &pp, TargetInfo &t) :
				315	PP(pp), Target(t)
				316	{
				317	// Scan all of the string portions, remember the max individual token length,
				318	// computing a bound on the concatenated string length, and see whether any
				319	// piece is a wide-string. If any of the string portions is a wide-string
				320	// literal, the result is a wide-string literal [C99 6.4.5p4].
				321	MaxTokenLength = StringToks[0].getLength();
				322	SizeBound = StringToks[0].getLength()-2; // -2 for "".
				323	AnyWide = StringToks[0].getKind() == tok::wide_string_literal;
				324
				325	// The common case is that there is only one string fragment.
				326	for (unsigned i = 1; i != NumStringToks; ++i) {
				327	// The string could be shorter than this if it needs cleaning, but this is a
				328	// reasonable bound, which is all we need.
				329	SizeBound += StringToks[i].getLength()-2; // -2 for "".
				330
				331	// Remember maximum string piece length.
				332	if (StringToks[i].getLength() > MaxTokenLength)
				333	MaxTokenLength = StringToks[i].getLength();
				334
				335	// Remember if we see any wide strings.
				336	AnyWide \|= StringToks[i].getKind() == tok::wide_string_literal;
				337	}
				338
				339
				340	// Include space for the null terminator.
				341	++SizeBound;
				342
				343	// TODO: K&R warning: "traditional C rejects string constant concatenation"
				344
				345	// Get the width in bytes of wchar_t. If no wchar_t strings are used, do not
				346	// query the target. As such, wchar_tByteWidth is only valid if AnyWide=true.
				347	wchar_tByteWidth = ~0U;
				348	if (AnyWide)
				349	wchar_tByteWidth = Target.getWCharWidth(StringToks[0].getLocation());
				350
				351	// The output buffer size needs to be large enough to hold wide characters.
				352	// This is a worst-case assumption which basically corresponds to L"" "long".
				353	if (AnyWide)
				354	SizeBound *= wchar_tByteWidth;
				355
				356	// Size the temporary buffer to hold the result string data.
				357	ResultBuf.resize(SizeBound);
				358
				359	// Likewise, but for each string piece.
				360	SmallString<512> TokenBuf;
				361	TokenBuf.resize(MaxTokenLength);
				362
				363	// Loop over all the strings, getting their spelling, and expanding them to
				364	// wide strings as appropriate.
				365	ResultPtr = &ResultBuf[0]; // Next byte to fill in.
				366
				367	for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
				368	const char *ThisTokBuf = &TokenBuf[0];
				369	// Get the spelling of the token, which eliminates trigraphs, etc. We know
				370	// that ThisTokBuf points to a buffer that is big enough for the whole token
				371	// and 'spelled' tokens can only shrink.
				372	unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
				373	const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote.
				374
				375	// TODO: Input character set mapping support.
				376
				377	// Skip L marker for wide strings.
				378	if (ThisTokBuf[0] == 'L') ++ThisTokBuf;
				379
				380	assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
				381	++ThisTokBuf;
				382
				383	while (ThisTokBuf != ThisTokEnd) {
				384	// Is this a span of non-escape characters?
				385	if (ThisTokBuf[0] != '\\') {
				386	const char *InStart = ThisTokBuf;
				387	do {
				388	++ThisTokBuf;
				389	} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
				390
				391	// Copy the character span over.
				392	unsigned Len = ThisTokBuf-InStart;
				393	if (!AnyWide) {
				394	memcpy(ResultPtr, InStart, Len);
				395	ResultPtr += Len;
				396	} else {
				397	// Note: our internal rep of wide char tokens is always little-endian.
				398	for (; Len; --Len, ++InStart) {
				399	*ResultPtr++ = InStart[0];
				400	// Add zeros at the end.
				401	for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
				402	*ResultPtr++ = 0;
				403	}
				404	}
				405	continue;
				406	}
				407
				408	// Otherwise, this is an escape character. Skip the '\' char.
				409	++ThisTokBuf;
				410
				411	// We know that this character can't be off the end of the buffer, because
				412	// that would have been \", which would not have been the end of string.
				413	unsigned ResultChar = *ThisTokBuf++;
				414	switch (ResultChar) {
				415	// These map to themselves.
				416	case '\\': case '\'': case '"': case '?': break;
				417
				418	// These have fixed mappings.
				419	case 'a':
				420	// TODO: K&R: the meaning of '\\a' is different in traditional C
				421	ResultChar = 7;
				422	break;
				423	case 'b':
				424	ResultChar = 8;
				425	break;
				426	case 'e':
				427	Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape, "e");
				428	ResultChar = 27;
				429	break;
				430	case 'f':
				431	ResultChar = 12;
				432	break;
				433	case 'n':
				434	ResultChar = 10;
				435	break;
				436	case 'r':
				437	ResultChar = 13;
				438	break;
				439	case 't':
				440	ResultChar = 9;
				441	break;
				442	case 'v':
				443	ResultChar = 11;
				444	break;
				445
				446	//case 'u': case 'U': // FIXME: UCNs.
				447	case 'x': // Hex escape.
				448	if (ThisTokBuf == ThisTokEnd \|\|
				449	(ResultChar = HexDigitValue(*ThisTokBuf)) == ~0U) {
				450	Diag(StringToks[i].getLocation(), diag::err_hex_escape_no_digits);
				451	ResultChar = 0;
				452	break;
				453	}
				454	++ThisTokBuf; // Consumed one hex digit.
				455
				456	assert(0 && "hex escape: unimp!");
				457	break;
				458	case '0': case '1': case '2': case '3':
				459	case '4': case '5': case '6': case '7':
				460	// Octal escapes.
				461	assert(0 && "octal escape: unimp!");
				462	break;
				463
				464	// Otherwise, these are not valid escapes.
				465	case '(': case '{': case '[': case '%':
				466	// GCC accepts these as extensions. We warn about them as such though.
				467	if (!PP.getLangOptions().NoExtensions) {
				468	Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape,
				469	std::string()+(char)ResultChar);
				470	break;
				471	}
				472	// FALL THROUGH.
				473	default:
				474	if (isgraph(ThisTokBuf[0])) {
				475	Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
				476	std::string()+(char)ResultChar);
				477	} else {
				478	Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
				479	"x"+utohexstr(ResultChar));
				480	}
				481	}
				482
				483	// Note: our internal rep of wide char tokens is always little-endian.
				484	*ResultPtr++ = ResultChar & 0xFF;
				485
				486	if (AnyWide) {
				487	for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
				488	ResultPtr++ = ResultChar >> i8;
				489	}
				490	}
				491	}
				492
				493	// Add zero terminator.
				494	*ResultPtr = 0;
				495	if (AnyWide) {
				496	for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
				497	*ResultPtr++ = 0;
				498	}
				499	}
				500
				501	void StringLiteralParser::Diag(SourceLocation Loc, unsigned DiagID,
				502	const std::string &M) {
				503	PP.Diag(Loc, DiagID, M);
				504	hadError = true;
				505	}
				506