Blame - lib/Lex/LiteralSupport.cpp - fp2-dev/platform/external/clang

blob: b2290b3187c6c43969a7a0c29fb5c40be9c4ddea [file] [log] [blame]

Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1	//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
Chris Lattner	0bc735f	2007-12-29 19:59:25 +0000	[diff] [blame]	5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// This file implements the NumericLiteralParser, CharLiteralParser, and
				11	// StringLiteralParser interfaces.
				12	//
				13	//===----------------------------------------------------------------------===//
				14
				15	#include "clang/Lex/LiteralSupport.h"
				16	#include "clang/Lex/Preprocessor.h"
Chris Lattner	500d329	2009-01-29 05:15:15 +0000	[diff] [blame]	17	#include "clang/Lex/LexDiagnostic.h"
Chris Lattner	136f93a	2007-07-16 06:55:01 +0000	[diff] [blame]	18	#include "clang/Basic/TargetInfo.h"
Eli Friedman	f74a458	2011-11-01 02:14:50 +0000	[diff] [blame]	19	#include "clang/Basic/ConvertUTF.h"
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	20	#include "llvm/ADT/StringExtras.h"
David Blaikie	9fe8c74	2011-09-23 05:35:21 +0000	[diff] [blame]	21	#include "llvm/Support/ErrorHandling.h"
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	22	using namespace clang;
				23
				24	/// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
				25	/// not valid.
				26	static int HexDigitValue(char C) {
				27	if (C >= '0' && C <= '9') return C-'0';
				28	if (C >= 'a' && C <= 'f') return C-'a'+10;
				29	if (C >= 'A' && C <= 'F') return C-'A'+10;
				30	return -1;
				31	}
				32
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	33	static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
				34	switch (kind) {
David Blaikie	b219cfc	2011-09-23 05:06:16 +0000	[diff] [blame]	35	default: llvm_unreachable("Unknown token type!");
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	36	case tok::char_constant:
				37	case tok::string_literal:
				38	case tok::utf8_string_literal:
				39	return Target.getCharWidth();
				40	case tok::wide_char_constant:
				41	case tok::wide_string_literal:
				42	return Target.getWCharWidth();
				43	case tok::utf16_char_constant:
				44	case tok::utf16_string_literal:
				45	return Target.getChar16Width();
				46	case tok::utf32_char_constant:
				47	case tok::utf32_string_literal:
				48	return Target.getChar32Width();
				49	}
				50	}
				51
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	52	/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
				53	/// either a character or a string literal.
				54	static unsigned ProcessCharEscape(const char *&ThisTokBuf,
				55	const char *ThisTokEnd, bool &HadError,
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	56	FullSourceLoc Loc, unsigned CharWidth,
David Blaikie	d6471f7	2011-09-25 23:23:43 +0000	[diff] [blame]	57	DiagnosticsEngine *Diags) {
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	58	// Skip the '\' char.
				59	++ThisTokBuf;
				60
				61	// We know that this character can't be off the end of the buffer, because
				62	// that would have been \", which would not have been the end of string.
				63	unsigned ResultChar = *ThisTokBuf++;
				64	switch (ResultChar) {
				65	// These map to themselves.
				66	case '\\': case '\'': case '"': case '?': break;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	67
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	68	// These have fixed mappings.
				69	case 'a':
				70	// TODO: K&R: the meaning of '\\a' is different in traditional C
				71	ResultChar = 7;
				72	break;
				73	case 'b':
				74	ResultChar = 8;
				75	break;
				76	case 'e':
Chris Lattner	91f54ce	2010-11-17 06:26:08 +0000	[diff] [blame]	77	if (Diags)
				78	Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	79	ResultChar = 27;
				80	break;
Eli Friedman	3c54801	2009-06-10 01:32:39 +0000	[diff] [blame]	81	case 'E':
Chris Lattner	91f54ce	2010-11-17 06:26:08 +0000	[diff] [blame]	82	if (Diags)
				83	Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
Eli Friedman	3c54801	2009-06-10 01:32:39 +0000	[diff] [blame]	84	ResultChar = 27;
				85	break;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	86	case 'f':
				87	ResultChar = 12;
				88	break;
				89	case 'n':
				90	ResultChar = 10;
				91	break;
				92	case 'r':
				93	ResultChar = 13;
				94	break;
				95	case 't':
				96	ResultChar = 9;
				97	break;
				98	case 'v':
				99	ResultChar = 11;
				100	break;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	101	case 'x': { // Hex escape.
				102	ResultChar = 0;
				103	if (ThisTokBuf == ThisTokEnd \|\| !isxdigit(*ThisTokBuf)) {
Chris Lattner	91f54ce	2010-11-17 06:26:08 +0000	[diff] [blame]	104	if (Diags)
				105	Diags->Report(Loc, diag::err_hex_escape_no_digits);
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	106	HadError = 1;
				107	break;
				108	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	109
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	110	// Hex escapes are a maximal series of hex digits.
				111	bool Overflow = false;
				112	for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
				113	int CharVal = HexDigitValue(ThisTokBuf[0]);
				114	if (CharVal == -1) break;
Chris Lattner	c29bbde	2008-09-30 20:45:40 +0000	[diff] [blame]	115	// About to shift out a digit?
				116	Overflow \|= (ResultChar & 0xF0000000) ? true : false;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	117	ResultChar <<= 4;
				118	ResultChar \|= CharVal;
				119	}
				120
				121	// See if any bits will be truncated when evaluated as a character.
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	122	if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
				123	Overflow = true;
				124	ResultChar &= ~0U >> (32-CharWidth);
				125	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	126
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	127	// Check for overflow.
Chris Lattner	91f54ce	2010-11-17 06:26:08 +0000	[diff] [blame]	128	if (Overflow && Diags) // Too many digits to fit in
				129	Diags->Report(Loc, diag::warn_hex_escape_too_large);
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	130	break;
				131	}
				132	case '0': case '1': case '2': case '3':
				133	case '4': case '5': case '6': case '7': {
				134	// Octal escapes.
				135	--ThisTokBuf;
				136	ResultChar = 0;
				137
				138	// Octal escapes are a series of octal digits with maximum length 3.
				139	// "\0123" is a two digit sequence equal to "\012" "3".
				140	unsigned NumDigits = 0;
				141	do {
				142	ResultChar <<= 3;
				143	ResultChar \|= *ThisTokBuf++ - '0';
				144	++NumDigits;
				145	} while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
				146	ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	147
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	148	// Check for overflow. Reject '\777', but not L'\777'.
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	149	if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
Chris Lattner	91f54ce	2010-11-17 06:26:08 +0000	[diff] [blame]	150	if (Diags)
				151	Diags->Report(Loc, diag::warn_octal_escape_too_large);
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	152	ResultChar &= ~0U >> (32-CharWidth);
				153	}
				154	break;
				155	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	156
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	157	// Otherwise, these are not valid escapes.
				158	case '(': case '{': case '[': case '%':
				159	// GCC accepts these as extensions. We warn about them as such though.
Chris Lattner	91f54ce	2010-11-17 06:26:08 +0000	[diff] [blame]	160	if (Diags)
				161	Diags->Report(Loc, diag::ext_nonstandard_escape)
Douglas Gregor	b90f4b3	2010-05-26 05:35:51 +0000	[diff] [blame]	162	<< std::string()+(char)ResultChar;
Eli Friedman	f01fdff	2009-04-28 00:51:18 +0000	[diff] [blame]	163	break;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	164	default:
Chris Lattner	91f54ce	2010-11-17 06:26:08 +0000	[diff] [blame]	165	if (Diags == 0)
Douglas Gregor	b90f4b3	2010-05-26 05:35:51 +0000	[diff] [blame]	166	break;
				167
Ted Kremenek	23ef69d	2010-12-03 00:09:56 +0000	[diff] [blame]	168	if (isgraph(ResultChar))
Chris Lattner	91f54ce	2010-11-17 06:26:08 +0000	[diff] [blame]	169	Diags->Report(Loc, diag::ext_unknown_escape)
				170	<< std::string()+(char)ResultChar;
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	171	else
Chris Lattner	91f54ce	2010-11-17 06:26:08 +0000	[diff] [blame]	172	Diags->Report(Loc, diag::ext_unknown_escape)
				173	<< "x"+llvm::utohexstr(ResultChar);
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	174	break;
				175	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	176
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	177	return ResultChar;
				178	}
				179
Steve Naroff	0e3e3eb	2009-03-30 23:46:03 +0000	[diff] [blame]	180	/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
Nico Weber	59705ae	2010-10-09 00:27:47 +0000	[diff] [blame]	181	/// return the UTF32.
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	182	static bool ProcessUCNEscape(const char ThisTokBegin, const char &ThisTokBuf,
				183	const char *ThisTokEnd,
Nico Weber	59705ae	2010-10-09 00:27:47 +0000	[diff] [blame]	184	uint32_t &UcnVal, unsigned short &UcnLen,
David Blaikie	d6471f7	2011-09-25 23:23:43 +0000	[diff] [blame]	185	FullSourceLoc Loc, DiagnosticsEngine *Diags,
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	186	const LangOptions &Features,
				187	bool in_char_string_literal = false) {
Chris Lattner	6c66f07	2010-11-17 06:46:14 +0000	[diff] [blame]	188	if (!Features.CPlusPlus && !Features.C99 && Diags)
Chris Lattner	872a45e	2010-11-17 06:55:10 +0000	[diff] [blame]	189	Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	190
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	191	const char *UcnBegin = ThisTokBuf;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	192
Steve Naroff	0e3e3eb	2009-03-30 23:46:03 +0000	[diff] [blame]	193	// Skip the '\u' char's.
				194	ThisTokBuf += 2;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	195
Steve Naroff	0e3e3eb	2009-03-30 23:46:03 +0000	[diff] [blame]	196	if (ThisTokBuf == ThisTokEnd \|\| !isxdigit(*ThisTokBuf)) {
Chris Lattner	6c66f07	2010-11-17 06:46:14 +0000	[diff] [blame]	197	if (Diags)
Chris Lattner	872a45e	2010-11-17 06:55:10 +0000	[diff] [blame]	198	Diags->Report(Loc, diag::err_ucn_escape_no_digits);
Nico Weber	59705ae	2010-10-09 00:27:47 +0000	[diff] [blame]	199	return false;
Steve Naroff	0e3e3eb	2009-03-30 23:46:03 +0000	[diff] [blame]	200	}
Nico Weber	59705ae	2010-10-09 00:27:47 +0000	[diff] [blame]	201	UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
Fariborz Jahanian	56bedef	2010-08-31 23:34:27 +0000	[diff] [blame]	202	unsigned short UcnLenSave = UcnLen;
Nico Weber	59705ae	2010-10-09 00:27:47 +0000	[diff] [blame]	203	for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
Steve Naroff	0e3e3eb	2009-03-30 23:46:03 +0000	[diff] [blame]	204	int CharVal = HexDigitValue(ThisTokBuf[0]);
				205	if (CharVal == -1) break;
				206	UcnVal <<= 4;
				207	UcnVal \|= CharVal;
				208	}
				209	// If we didn't consume the proper number of digits, there is a problem.
Nico Weber	59705ae	2010-10-09 00:27:47 +0000	[diff] [blame]	210	if (UcnLenSave) {
Chris Lattner	872a45e	2010-11-17 06:55:10 +0000	[diff] [blame]	211	if (Diags) {
Chris Lattner	7ef5c27	2010-11-17 07:05:50 +0000	[diff] [blame]	212	SourceLocation L =
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	213	Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
Chris Lattner	7ef5c27	2010-11-17 07:05:50 +0000	[diff] [blame]	214	Loc.getManager(), Features);
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	215	Diags->Report(L, diag::err_ucn_escape_incomplete);
Chris Lattner	872a45e	2010-11-17 06:55:10 +0000	[diff] [blame]	216	}
Nico Weber	59705ae	2010-10-09 00:27:47 +0000	[diff] [blame]	217	return false;
Steve Naroff	0e3e3eb	2009-03-30 23:46:03 +0000	[diff] [blame]	218	}
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	219
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	220	// Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	221	if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) \|\| // surrogate codepoints
				222	UcnVal > 0x10FFFF) { // maximum legal UTF32 value
Chris Lattner	6c66f07	2010-11-17 06:46:14 +0000	[diff] [blame]	223	if (Diags)
Chris Lattner	872a45e	2010-11-17 06:55:10 +0000	[diff] [blame]	224	Diags->Report(Loc, diag::err_ucn_escape_invalid);
Nico Weber	59705ae	2010-10-09 00:27:47 +0000	[diff] [blame]	225	return false;
				226	}
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	227
				228	// C++11 allows UCNs that refer to control characters and basic source
				229	// characters inside character and string literals
				230	if (UcnVal < 0xa0 &&
				231	(UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, `
				232	bool IsError = (!Features.CPlusPlus0x \|\| !in_char_string_literal);
				233	if (Diags) {
				234	SourceLocation UcnBeginLoc =
				235	Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
				236	Loc.getManager(), Features);
				237	char BasicSCSChar = UcnVal;
				238	if (UcnVal >= 0x20 && UcnVal < 0x7f)
				239	Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_escape_basic_scs :
				240	diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
				241	<< StringRef(&BasicSCSChar, 1);
				242	else
				243	Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_control_character :
				244	diag::warn_cxx98_compat_literal_ucn_control_character);
				245	}
				246	if (IsError)
				247	return false;
				248	}
				249
Nico Weber	59705ae	2010-10-09 00:27:47 +0000	[diff] [blame]	250	return true;
				251	}
				252
Richard Smith	df9ef1b	2012-06-13 05:37:23 +0000	[diff] [blame]	253	/// MeasureUCNEscape - Determine the number of bytes within the resulting string
				254	/// which this UCN will occupy.
				255	static int MeasureUCNEscape(const char ThisTokBegin, const char &ThisTokBuf,
				256	const char *ThisTokEnd, unsigned CharByteWidth,
				257	const LangOptions &Features, bool &HadError) {
				258	// UTF-32: 4 bytes per escape.
				259	if (CharByteWidth == 4)
				260	return 4;
				261
				262	uint32_t UcnVal = 0;
				263	unsigned short UcnLen = 0;
				264	FullSourceLoc Loc;
				265
				266	if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
				267	UcnLen, Loc, 0, Features, true)) {
				268	HadError = true;
				269	return 0;
				270	}
				271
				272	// UTF-16: 2 bytes for BMP, 4 bytes otherwise.
				273	if (CharByteWidth == 2)
				274	return UcnVal <= 0xFFFF ? 2 : 4;
				275
				276	// UTF-8.
				277	if (UcnVal < 0x80)
				278	return 1;
				279	if (UcnVal < 0x800)
				280	return 2;
				281	if (UcnVal < 0x10000)
				282	return 3;
				283	return 4;
				284	}
				285
Nico Weber	59705ae	2010-10-09 00:27:47 +0000	[diff] [blame]	286	/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
				287	/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
				288	/// StringLiteralParser. When we decide to implement UCN's for identifiers,
				289	/// we will likely rework our support for UCN's.
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	290	static void EncodeUCNEscape(const char ThisTokBegin, const char &ThisTokBuf,
				291	const char *ThisTokEnd,
Chris Lattner	a95880d	2010-11-17 07:12:42 +0000	[diff] [blame]	292	char *&ResultBuf, bool &HadError,
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	293	FullSourceLoc Loc, unsigned CharByteWidth,
David Blaikie	d6471f7	2011-09-25 23:23:43 +0000	[diff] [blame]	294	DiagnosticsEngine *Diags,
				295	const LangOptions &Features) {
Nico Weber	59705ae	2010-10-09 00:27:47 +0000	[diff] [blame]	296	typedef uint32_t UTF32;
				297	UTF32 UcnVal = 0;
				298	unsigned short UcnLen = 0;
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	299	if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
				300	Loc, Diags, Features, true)) {
Richard Smith	df9ef1b	2012-06-13 05:37:23 +0000	[diff] [blame]	301	HadError = true;
Steve Naroff	0e3e3eb	2009-03-30 23:46:03 +0000	[diff] [blame]	302	return;
				303	}
Nico Weber	59705ae	2010-10-09 00:27:47 +0000	[diff] [blame]	304
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	305	assert((CharByteWidth == 1 \|\| CharByteWidth == 2 \|\| CharByteWidth) &&
				306	"only character widths of 1, 2, or 4 bytes supported");
Nico Weber	a0f15b0	2010-10-06 04:57:26 +0000	[diff] [blame]	307
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	308	(void)UcnLen;
				309	assert((UcnLen== 4 \|\| UcnLen== 8) && "only ucn length of 4 or 8 supported");
Nico Weber	a0f15b0	2010-10-06 04:57:26 +0000	[diff] [blame]	310
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	311	if (CharByteWidth == 4) {
Eli Friedman	caf1f26	2011-11-02 23:06:23 +0000	[diff] [blame]	312	// FIXME: Make the type of the result buffer correct instead of
				313	// using reinterpret_cast.
				314	UTF32 ResultPtr = reinterpret_cast<UTF32>(ResultBuf);
				315	*ResultPtr = UcnVal;
				316	ResultBuf += 4;
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	317	return;
				318	}
				319
				320	if (CharByteWidth == 2) {
Eli Friedman	caf1f26	2011-11-02 23:06:23 +0000	[diff] [blame]	321	// FIXME: Make the type of the result buffer correct instead of
				322	// using reinterpret_cast.
				323	UTF16 ResultPtr = reinterpret_cast<UTF16>(ResultBuf);
				324
Richard Smith	59b26d8	2012-06-13 05:41:29 +0000	[diff] [blame]	325	if (UcnVal <= (UTF32)0xFFFF) {
Eli Friedman	caf1f26	2011-11-02 23:06:23 +0000	[diff] [blame]	326	*ResultPtr = UcnVal;
				327	ResultBuf += 2;
Nico Weber	a0f15b0	2010-10-06 04:57:26 +0000	[diff] [blame]	328	return;
				329	}
Nico Weber	a0f15b0	2010-10-06 04:57:26 +0000	[diff] [blame]	330
Eli Friedman	caf1f26	2011-11-02 23:06:23 +0000	[diff] [blame]	331	// Convert to UTF16.
Nico Weber	a0f15b0	2010-10-06 04:57:26 +0000	[diff] [blame]	332	UcnVal -= 0x10000;
Eli Friedman	caf1f26	2011-11-02 23:06:23 +0000	[diff] [blame]	333	*ResultPtr = 0xD800 + (UcnVal >> 10);
				334	*(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
				335	ResultBuf += 4;
Fariborz Jahanian	56bedef	2010-08-31 23:34:27 +0000	[diff] [blame]	336	return;
				337	}
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	338
				339	assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
				340
Steve Naroff	0e3e3eb	2009-03-30 23:46:03 +0000	[diff] [blame]	341	// Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
				342	// The conversion below was inspired by:
				343	// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	344	// First, we determine how many bytes the result will require.
Steve Naroff	4e93b34	2009-04-01 11:09:15 +0000	[diff] [blame]	345	typedef uint8_t UTF8;
Steve Naroff	0e3e3eb	2009-03-30 23:46:03 +0000	[diff] [blame]	346
				347	unsigned short bytesToWrite = 0;
				348	if (UcnVal < (UTF32)0x80)
				349	bytesToWrite = 1;
				350	else if (UcnVal < (UTF32)0x800)
				351	bytesToWrite = 2;
				352	else if (UcnVal < (UTF32)0x10000)
				353	bytesToWrite = 3;
				354	else
				355	bytesToWrite = 4;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	356
Steve Naroff	0e3e3eb	2009-03-30 23:46:03 +0000	[diff] [blame]	357	const unsigned byteMask = 0xBF;
				358	const unsigned byteMark = 0x80;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	359
Steve Naroff	0e3e3eb	2009-03-30 23:46:03 +0000	[diff] [blame]	360	// Once the bits are split out into bytes of UTF8, this is a mask OR-ed
Steve Naroff	8a5c0cd	2009-03-31 10:29:45 +0000	[diff] [blame]	361	// into the first byte, depending on how many bytes follow.
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	362	static const UTF8 firstByteMark[5] = {
Steve Naroff	8a5c0cd	2009-03-31 10:29:45 +0000	[diff] [blame]	363	0x00, 0x00, 0xC0, 0xE0, 0xF0
Steve Naroff	0e3e3eb	2009-03-30 23:46:03 +0000	[diff] [blame]	364	};
				365	// Finally, we write the bytes into ResultBuf.
				366	ResultBuf += bytesToWrite;
				367	switch (bytesToWrite) { // note: everything falls through.
				368	case 4: *--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= 6;
				369	case 3: *--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= 6;
				370	case 2: *--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= 6;
				371	case 1: *--ResultBuf = (UTF8) (UcnVal \| firstByteMark[bytesToWrite]);
				372	}
				373	// Update the buffer.
				374	ResultBuf += bytesToWrite;
				375	}
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	376
				377
				378	/// integer-constant: [C99 6.4.4.1]
				379	/// decimal-constant integer-suffix
				380	/// octal-constant integer-suffix
				381	/// hexadecimal-constant integer-suffix
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	382	/// user-defined-integer-literal: [C++11 lex.ext]
Richard Smith	b453ad3	2012-03-08 08:45:32 +0000	[diff] [blame]	383	/// decimal-literal ud-suffix
				384	/// octal-literal ud-suffix
				385	/// hexadecimal-literal ud-suffix
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	386	/// decimal-constant:
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	387	/// nonzero-digit
				388	/// decimal-constant digit
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	389	/// octal-constant:
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	390	/// 0
				391	/// octal-constant octal-digit
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	392	/// hexadecimal-constant:
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	393	/// hexadecimal-prefix hexadecimal-digit
				394	/// hexadecimal-constant hexadecimal-digit
				395	/// hexadecimal-prefix: one of
				396	/// 0x 0X
				397	/// integer-suffix:
				398	/// unsigned-suffix [long-suffix]
				399	/// unsigned-suffix [long-long-suffix]
				400	/// long-suffix [unsigned-suffix]
				401	/// long-long-suffix [unsigned-sufix]
				402	/// nonzero-digit:
				403	/// 1 2 3 4 5 6 7 8 9
				404	/// octal-digit:
				405	/// 0 1 2 3 4 5 6 7
				406	/// hexadecimal-digit:
				407	/// 0 1 2 3 4 5 6 7 8 9
				408	/// a b c d e f
				409	/// A B C D E F
				410	/// unsigned-suffix: one of
				411	/// u U
				412	/// long-suffix: one of
				413	/// l L
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	414	/// long-long-suffix: one of
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	415	/// ll LL
				416	///
				417	/// floating-constant: [C99 6.4.4.2]
				418	/// TODO: add rules...
				419	///
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	420	NumericLiteralParser::
				421	NumericLiteralParser(const char begin, const char end,
				422	SourceLocation TokLoc, Preprocessor &pp)
				423	: PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	424
Chris Lattner	c29bbde	2008-09-30 20:45:40 +0000	[diff] [blame]	425	// This routine assumes that the range begin/end matches the regex for integer
				426	// and FP constants (specifically, the 'pp-number' regex), and assumes that
				427	// the byte at "*end" is both valid and not part of the regex. Because of
				428	// this, it doesn't have to check for 'overscan' in various places.
				429	assert(!isalnum(end) && end != '.' && *end != '_' &&
				430	"Lexer didn't maximally munch?");
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	431
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	432	s = DigitsBegin = begin;
				433	saw_exponent = false;
				434	saw_period = false;
Richard Smith	b453ad3	2012-03-08 08:45:32 +0000	[diff] [blame]	435	saw_ud_suffix = false;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	436	isLong = false;
				437	isUnsigned = false;
				438	isLongLong = false;
Chris Lattner	6e400c2	2007-08-26 03:29:23 +0000	[diff] [blame]	439	isFloat = false;
Chris Lattner	506b8de	2007-08-26 01:58:14 +0000	[diff] [blame]	440	isImaginary = false;
Mike Stump	b79fe2d	2009-10-08 22:55:36 +0000	[diff] [blame]	441	isMicrosoftInteger = false;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	442	hadError = false;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	443
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	444	if (*s == '0') { // parse radix
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	445	ParseNumberStartingWithZero(TokLoc);
				446	if (hadError)
				447	return;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	448	} else { // the first digit is non-zero
				449	radix = 10;
				450	s = SkipDigits(s);
				451	if (s == ThisTokEnd) {
				452	// Done.
Christopher Lamb	016765e	2007-11-29 06:06:27 +0000	[diff] [blame]	453	} else if (isxdigit(s) && !(s == 'e' \|\| *s == 'E')) {
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	454	PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
Chris Lattner	5f9e272	2011-07-23 10:55:15 +0000	[diff] [blame]	455	diag::err_invalid_decimal_digit) << StringRef(s, 1);
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	456	hadError = true;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	457	return;
				458	} else if (*s == '.') {
				459	s++;
				460	saw_period = true;
				461	s = SkipDigits(s);
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	462	}
Chris Lattner	4411f46	2008-09-29 23:12:31 +0000	[diff] [blame]	463	if ((s == 'e' \|\| s == 'E')) { // exponent
Chris Lattner	70f66ab	2008-04-20 18:47:55 +0000	[diff] [blame]	464	const char *Exponent = s;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	465	s++;
				466	saw_exponent = true;
				467	if (s == '+' \|\| s == '-') s++; // sign
				468	const char *first_non_digit = SkipDigits(s);
Chris Lattner	0b7f69d	2008-04-20 18:41:46 +0000	[diff] [blame]	469	if (first_non_digit != s) {
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	470	s = first_non_digit;
Chris Lattner	0b7f69d	2008-04-20 18:41:46 +0000	[diff] [blame]	471	} else {
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	472	PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
				473	diag::err_exponent_has_no_digits);
				474	hadError = true;
Chris Lattner	0b7f69d	2008-04-20 18:41:46 +0000	[diff] [blame]	475	return;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	476	}
				477	}
				478	}
				479
				480	SuffixBegin = s;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	481
Chris Lattner	506b8de	2007-08-26 01:58:14 +0000	[diff] [blame]	482	// Parse the suffix. At this point we can classify whether we have an FP or
				483	// integer constant.
				484	bool isFPConstant = isFloatingLiteral();
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	485
Chris Lattner	506b8de	2007-08-26 01:58:14 +0000	[diff] [blame]	486	// Loop over all of the characters of the suffix. If we see something bad,
				487	// we break out of the loop.
				488	for (; s != ThisTokEnd; ++s) {
				489	switch (*s) {
				490	case 'f': // FP Suffix for "float"
				491	case 'F':
				492	if (!isFPConstant) break; // Error for integer constant.
Chris Lattner	6e400c2	2007-08-26 03:29:23 +0000	[diff] [blame]	493	if (isFloat \|\| isLong) break; // FF, LF invalid.
				494	isFloat = true;
Chris Lattner	506b8de	2007-08-26 01:58:14 +0000	[diff] [blame]	495	continue; // Success.
				496	case 'u':
				497	case 'U':
				498	if (isFPConstant) break; // Error for floating constant.
				499	if (isUnsigned) break; // Cannot be repeated.
				500	isUnsigned = true;
				501	continue; // Success.
				502	case 'l':
				503	case 'L':
				504	if (isLong \|\| isLongLong) break; // Cannot be repeated.
Chris Lattner	6e400c2	2007-08-26 03:29:23 +0000	[diff] [blame]	505	if (isFloat) break; // LF invalid.
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	506
Chris Lattner	506b8de	2007-08-26 01:58:14 +0000	[diff] [blame]	507	// Check for long long. The L's need to be adjacent and the same case.
				508	if (s+1 != ThisTokEnd && s[1] == s[0]) {
				509	if (isFPConstant) break; // long long invalid for floats.
				510	isLongLong = true;
				511	++s; // Eat both of them.
				512	} else {
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	513	isLong = true;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	514	}
Chris Lattner	506b8de	2007-08-26 01:58:14 +0000	[diff] [blame]	515	continue; // Success.
				516	case 'i':
Chris Lattner	c637415	2010-10-14 00:24:10 +0000	[diff] [blame]	517	case 'I':
David Blaikie	4e4d084	2012-03-11 07:00:24 +0000	[diff] [blame]	518	if (PP.getLangOpts().MicrosoftExt) {
Fariborz Jahanian	a8be02b	2010-01-22 21:36:53 +0000	[diff] [blame]	519	if (isFPConstant \|\| isLong \|\| isLongLong) break;
Nuno Lopes	6e8c7ac	2009-11-28 13:37:52 +0000	[diff] [blame]	520
Steve Naroff	0c29b22	2008-04-04 21:02:54 +0000	[diff] [blame]	521	// Allow i8, i16, i32, i64, and i128.
Mike Stump	b79fe2d	2009-10-08 22:55:36 +0000	[diff] [blame]	522	if (s + 1 != ThisTokEnd) {
				523	switch (s[1]) {
				524	case '8':
				525	s += 2; // i8 suffix
				526	isMicrosoftInteger = true;
Nuno Lopes	6e8c7ac	2009-11-28 13:37:52 +0000	[diff] [blame]	527	break;
Mike Stump	b79fe2d	2009-10-08 22:55:36 +0000	[diff] [blame]	528	case '1':
Nuno Lopes	6e8c7ac	2009-11-28 13:37:52 +0000	[diff] [blame]	529	if (s + 2 == ThisTokEnd) break;
Francois Pichet	d062b60	2011-01-11 11:57:53 +0000	[diff] [blame]	530	if (s[2] == '6') {
				531	s += 3; // i16 suffix
				532	isMicrosoftInteger = true;
				533	}
Nuno Lopes	6e8c7ac	2009-11-28 13:37:52 +0000	[diff] [blame]	534	else if (s[2] == '2') {
				535	if (s + 3 == ThisTokEnd) break;
Francois Pichet	d062b60	2011-01-11 11:57:53 +0000	[diff] [blame]	536	if (s[3] == '8') {
				537	s += 4; // i128 suffix
				538	isMicrosoftInteger = true;
				539	}
Mike Stump	b79fe2d	2009-10-08 22:55:36 +0000	[diff] [blame]	540	}
Nuno Lopes	6e8c7ac	2009-11-28 13:37:52 +0000	[diff] [blame]	541	break;
Mike Stump	b79fe2d	2009-10-08 22:55:36 +0000	[diff] [blame]	542	case '3':
Nuno Lopes	6e8c7ac	2009-11-28 13:37:52 +0000	[diff] [blame]	543	if (s + 2 == ThisTokEnd) break;
Francois Pichet	d062b60	2011-01-11 11:57:53 +0000	[diff] [blame]	544	if (s[2] == '2') {
				545	s += 3; // i32 suffix
				546	isLong = true;
				547	isMicrosoftInteger = true;
				548	}
Nuno Lopes	6e8c7ac	2009-11-28 13:37:52 +0000	[diff] [blame]	549	break;
Mike Stump	b79fe2d	2009-10-08 22:55:36 +0000	[diff] [blame]	550	case '6':
Nuno Lopes	6e8c7ac	2009-11-28 13:37:52 +0000	[diff] [blame]	551	if (s + 2 == ThisTokEnd) break;
Francois Pichet	d062b60	2011-01-11 11:57:53 +0000	[diff] [blame]	552	if (s[2] == '4') {
				553	s += 3; // i64 suffix
				554	isLongLong = true;
				555	isMicrosoftInteger = true;
				556	}
Nuno Lopes	6e8c7ac	2009-11-28 13:37:52 +0000	[diff] [blame]	557	break;
Mike Stump	b79fe2d	2009-10-08 22:55:36 +0000	[diff] [blame]	558	default:
				559	break;
				560	}
				561	break;
Steve Naroff	0c29b22	2008-04-04 21:02:54 +0000	[diff] [blame]	562	}
Steve Naroff	0c29b22	2008-04-04 21:02:54 +0000	[diff] [blame]	563	}
				564	// fall through.
Chris Lattner	506b8de	2007-08-26 01:58:14 +0000	[diff] [blame]	565	case 'j':
				566	case 'J':
				567	if (isImaginary) break; // Cannot be repeated.
				568	PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
				569	diag::ext_imaginary_constant);
				570	isImaginary = true;
				571	continue; // Success.
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	572	}
Richard Smith	b453ad3	2012-03-08 08:45:32 +0000	[diff] [blame]	573	// If we reached here, there was an error or a ud-suffix.
Chris Lattner	506b8de	2007-08-26 01:58:14 +0000	[diff] [blame]	574	break;
				575	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	576
Chris Lattner	506b8de	2007-08-26 01:58:14 +0000	[diff] [blame]	577	if (s != ThisTokEnd) {
David Blaikie	4e4d084	2012-03-11 07:00:24 +0000	[diff] [blame]	578	if (PP.getLangOpts().CPlusPlus0x && s == SuffixBegin && *s == '_') {
Richard Smith	b453ad3	2012-03-08 08:45:32 +0000	[diff] [blame]	579	// We have a ud-suffix! By C++11 [lex.ext]p10, ud-suffixes not starting
				580	// with an '_' are ill-formed.
				581	saw_ud_suffix = true;
				582	return;
				583	}
				584
				585	// Report an error if there are any.
				586	PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin-begin),
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	587	isFPConstant ? diag::err_invalid_suffix_float_constant :
				588	diag::err_invalid_suffix_integer_constant)
Chris Lattner	5f9e272	2011-07-23 10:55:15 +0000	[diff] [blame]	589	<< StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	590	hadError = true;
Chris Lattner	506b8de	2007-08-26 01:58:14 +0000	[diff] [blame]	591	return;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	592	}
				593	}
				594
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	595	/// ParseNumberStartingWithZero - This method is called when the first character
				596	/// of the number is found to be a zero. This means it is either an octal
				597	/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	598	/// a floating point number (01239.123e4). Eat the prefix, determining the
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	599	/// radix etc.
				600	void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
				601	assert(s[0] == '0' && "Invalid method call");
				602	s++;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	603
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	604	// Handle a hex number like 0x1234.
				605	if ((s == 'x' \|\| s == 'X') && (isxdigit(s[1]) \|\| s[1] == '.')) {
				606	s++;
				607	radix = 16;
				608	DigitsBegin = s;
				609	s = SkipHexDigits(s);
Aaron Ballman	66b0eba	2012-02-08 13:36:33 +0000	[diff] [blame]	610	bool noSignificand = (s == DigitsBegin);
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	611	if (s == ThisTokEnd) {
				612	// Done.
				613	} else if (*s == '.') {
				614	s++;
				615	saw_period = true;
Aaron Ballman	66b0eba	2012-02-08 13:36:33 +0000	[diff] [blame]	616	const char *floatDigitsBegin = s;
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	617	s = SkipHexDigits(s);
Aaron Ballman	66b0eba	2012-02-08 13:36:33 +0000	[diff] [blame]	618	noSignificand &= (floatDigitsBegin == s);
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	619	}
Aaron Ballman	66b0eba	2012-02-08 13:36:33 +0000	[diff] [blame]	620
				621	if (noSignificand) {
				622	PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), \
				623	diag::err_hexconstant_requires_digits);
				624	hadError = true;
				625	return;
				626	}
				627
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	628	// A binary exponent can appear with or with a '.'. If dotted, the
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	629	// binary exponent is required.
Douglas Gregor	1155c42	2011-08-30 22:40:35 +0000	[diff] [blame]	630	if (s == 'p' \|\| s == 'P') {
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	631	const char *Exponent = s;
				632	s++;
				633	saw_exponent = true;
				634	if (s == '+' \|\| s == '-') s++; // sign
				635	const char *first_non_digit = SkipDigits(s);
Chris Lattner	6ea6238	2008-07-25 18:18:34 +0000	[diff] [blame]	636	if (first_non_digit == s) {
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	637	PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
				638	diag::err_exponent_has_no_digits);
				639	hadError = true;
Chris Lattner	6ea6238	2008-07-25 18:18:34 +0000	[diff] [blame]	640	return;
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	641	}
Chris Lattner	6ea6238	2008-07-25 18:18:34 +0000	[diff] [blame]	642	s = first_non_digit;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	643
David Blaikie	4e4d084	2012-03-11 07:00:24 +0000	[diff] [blame]	644	if (!PP.getLangOpts().HexFloats)
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	645	PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	646	} else if (saw_period) {
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	647	PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
				648	diag::err_hexconstant_requires_exponent);
				649	hadError = true;
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	650	}
				651	return;
				652	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	653
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	654	// Handle simple binary numbers 0b01010
				655	if (s == 'b' \|\| s == 'B') {
				656	// 0b101010 is a GCC extension.
Chris Lattner	413d355	2008-06-30 06:44:49 +0000	[diff] [blame]	657	PP.Diag(TokLoc, diag::ext_binary_literal);
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	658	++s;
				659	radix = 2;
				660	DigitsBegin = s;
				661	s = SkipBinaryDigits(s);
				662	if (s == ThisTokEnd) {
				663	// Done.
				664	} else if (isxdigit(*s)) {
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	665	PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
Chris Lattner	5f9e272	2011-07-23 10:55:15 +0000	[diff] [blame]	666	diag::err_invalid_binary_digit) << StringRef(s, 1);
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	667	hadError = true;
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	668	}
Chris Lattner	413d355	2008-06-30 06:44:49 +0000	[diff] [blame]	669	// Other suffixes will be diagnosed by the caller.
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	670	return;
				671	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	672
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	673	// For now, the radix is set to 8. If we discover that we have a
				674	// floating point constant, the radix will change to 10. Octal floating
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	675	// point constants are not permitted (only decimal and hexadecimal).
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	676	radix = 8;
				677	DigitsBegin = s;
				678	s = SkipOctalDigits(s);
				679	if (s == ThisTokEnd)
				680	return; // Done, simple octal number like 01234
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	681
Chris Lattner	413d355	2008-06-30 06:44:49 +0000	[diff] [blame]	682	// If we have some other non-octal digit that is a decimal digit, see if
				683	// this is part of a floating point number like 094.123 or 09e1.
				684	if (isdigit(*s)) {
				685	const char *EndDecimal = SkipDigits(s);
				686	if (EndDecimal[0] == '.' \|\| EndDecimal[0] == 'e' \|\| EndDecimal[0] == 'E') {
				687	s = EndDecimal;
				688	radix = 10;
				689	}
				690	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	691
Chris Lattner	413d355	2008-06-30 06:44:49 +0000	[diff] [blame]	692	// If we have a hex digit other than 'e' (which denotes a FP exponent) then
				693	// the code is using an incorrect base.
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	694	if (isxdigit(s) && s != 'e' && *s != 'E') {
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	695	PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
Chris Lattner	5f9e272	2011-07-23 10:55:15 +0000	[diff] [blame]	696	diag::err_invalid_octal_digit) << StringRef(s, 1);
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	697	hadError = true;
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	698	return;
				699	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	700
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	701	if (*s == '.') {
				702	s++;
				703	radix = 10;
				704	saw_period = true;
Chris Lattner	413d355	2008-06-30 06:44:49 +0000	[diff] [blame]	705	s = SkipDigits(s); // Skip suffix.
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	706	}
				707	if (s == 'e' \|\| s == 'E') { // exponent
				708	const char *Exponent = s;
				709	s++;
				710	radix = 10;
				711	saw_exponent = true;
				712	if (s == '+' \|\| s == '-') s++; // sign
				713	const char *first_non_digit = SkipDigits(s);
				714	if (first_non_digit != s) {
				715	s = first_non_digit;
				716	} else {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	717	PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
Chris Lattner	ac92d82	2008-11-22 07:23:31 +0000	[diff] [blame]	718	diag::err_exponent_has_no_digits);
				719	hadError = true;
Chris Lattner	368328c	2008-06-30 06:39:54 +0000	[diff] [blame]	720	return;
				721	}
				722	}
				723	}
				724
				725
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	726	/// GetIntegerValue - Convert this numeric literal value to an APInt that
				727	/// matches Val's input width. If there is an overflow, set Val to the low bits
				728	/// of the result and return true. Otherwise, return false.
				729	bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
Daniel Dunbar	a179be3	2008-10-16 07:32:01 +0000	[diff] [blame]	730	// Fast path: Compute a conservative bound on the maximum number of
				731	// bits per digit in this radix. If we can't possibly overflow a
				732	// uint64 based on that bound then do the simple conversion to
				733	// integer. This avoids the expensive overflow checking below, and
				734	// handles the common cases that matter (small decimal integers and
				735	// hex/octal values which don't overflow).
				736	unsigned MaxBitsPerDigit = 1;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	737	while ((1U << MaxBitsPerDigit) < radix)
Daniel Dunbar	a179be3	2008-10-16 07:32:01 +0000	[diff] [blame]	738	MaxBitsPerDigit += 1;
				739	if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
				740	uint64_t N = 0;
				741	for (s = DigitsBegin; s != SuffixBegin; ++s)
				742	N = Nradix + HexDigitValue(s);
				743
				744	// This will truncate the value to Val's input width. Simply check
				745	// for overflow by comparing.
				746	Val = N;
				747	return Val.getZExtValue() != N;
				748	}
				749
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	750	Val = 0;
				751	s = DigitsBegin;
				752
				753	llvm::APInt RadixVal(Val.getBitWidth(), radix);
				754	llvm::APInt CharVal(Val.getBitWidth(), 0);
				755	llvm::APInt OldVal = Val;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	756
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	757	bool OverflowOccurred = false;
				758	while (s < SuffixBegin) {
				759	unsigned C = HexDigitValue(*s++);
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	760
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	761	// If this letter is out of bound for this radix, reject it.
				762	assert(C < radix && "NumericLiteralParser ctor should have rejected this");
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	763
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	764	CharVal = C;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	765
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	766	// Add the digit to the value in the appropriate radix. If adding in digits
				767	// made the value smaller, then this overflowed.
				768	OldVal = Val;
				769
				770	// Multiply by radix, did overflow occur on the multiply?
				771	Val *= RadixVal;
				772	OverflowOccurred \|= Val.udiv(RadixVal) != OldVal;
				773
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	774	// Add value, did overflow occur on the value?
Daniel Dunbar	d70cb64	2008-10-16 06:39:30 +0000	[diff] [blame]	775	// (a + b) ult b <=> overflow
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	776	Val += CharVal;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	777	OverflowOccurred \|= Val.ult(CharVal);
				778	}
				779	return OverflowOccurred;
				780	}
				781
John McCall	94c939d	2009-12-24 09:08:04 +0000	[diff] [blame]	782	llvm::APFloat::opStatus
				783	NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
Ted Kremenek	427d5af	2007-11-26 23:12:30 +0000	[diff] [blame]	784	using llvm::APFloat;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	785
Erick Tryzelaar	e9f195f	2009-08-16 23:36:28 +0000	[diff] [blame]	786	unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
John McCall	94c939d	2009-12-24 09:08:04 +0000	[diff] [blame]	787	return Result.convertFromString(StringRef(ThisTokBegin, n),
				788	APFloat::rmNearestTiesToEven);
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	789	}
				790
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	791
James Dennett	58f9ce1	2012-06-17 03:34:42 +0000	[diff] [blame]	792	/// \verbatim
Richard Smith	5cc2c6e	2012-03-05 04:02:15 +0000	[diff] [blame]	793	/// user-defined-character-literal: [C++11 lex.ext]
				794	/// character-literal ud-suffix
				795	/// ud-suffix:
				796	/// identifier
				797	/// character-literal: [C++11 lex.ccon]
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	798	/// ' c-char-sequence '
				799	/// u' c-char-sequence '
				800	/// U' c-char-sequence '
				801	/// L' c-char-sequence '
				802	/// c-char-sequence:
				803	/// c-char
				804	/// c-char-sequence c-char
				805	/// c-char:
				806	/// any member of the source character set except the single-quote ',
				807	/// backslash \, or new-line character
				808	/// escape-sequence
				809	/// universal-character-name
Richard Smith	5cc2c6e	2012-03-05 04:02:15 +0000	[diff] [blame]	810	/// escape-sequence:
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	811	/// simple-escape-sequence
				812	/// octal-escape-sequence
				813	/// hexadecimal-escape-sequence
				814	/// simple-escape-sequence:
NAKAMURA Takumi	ddddd48	2011-08-12 05:49:51 +0000	[diff] [blame]	815	/// one of \' \" \? \\ \a \b \f \n \r \t \v
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	816	/// octal-escape-sequence:
				817	/// \ octal-digit
				818	/// \ octal-digit octal-digit
				819	/// \ octal-digit octal-digit octal-digit
				820	/// hexadecimal-escape-sequence:
				821	/// \x hexadecimal-digit
				822	/// hexadecimal-escape-sequence hexadecimal-digit
Richard Smith	5cc2c6e	2012-03-05 04:02:15 +0000	[diff] [blame]	823	/// universal-character-name: [C++11 lex.charset]
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	824	/// \u hex-quad
				825	/// \U hex-quad hex-quad
				826	/// hex-quad:
				827	/// hex-digit hex-digit hex-digit hex-digit
James Dennett	58f9ce1	2012-06-17 03:34:42 +0000	[diff] [blame]	828	/// \endverbatim
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	829	///
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	830	CharLiteralParser::CharLiteralParser(const char begin, const char end,
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	831	SourceLocation Loc, Preprocessor &PP,
				832	tok::TokenKind kind) {
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	833	// At this point we know that the character matches the regex "(L\|u\|U)?'.*'".
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	834	HadError = false;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	835
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	836	Kind = kind;
				837
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	838	const char *TokBegin = begin;
				839
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	840	// Skip over wide character determinant.
				841	if (Kind != tok::char_constant) {
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	842	++begin;
				843	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	844
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	845	// Skip over the entry quote.
				846	assert(begin[0] == '\'' && "Invalid token lexed");
				847	++begin;
				848
Richard Smith	5cc2c6e	2012-03-05 04:02:15 +0000	[diff] [blame]	849	// Remove an optional ud-suffix.
				850	if (end[-1] != '\'') {
				851	const char *UDSuffixEnd = end;
				852	do {
				853	--end;
				854	} while (end[-1] != '\'');
				855	UDSuffixBuf.assign(end, UDSuffixEnd);
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	856	UDSuffixOffset = end - TokBegin;
Richard Smith	5cc2c6e	2012-03-05 04:02:15 +0000	[diff] [blame]	857	}
				858
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	859	// Trim the ending quote.
Richard Smith	5cc2c6e	2012-03-05 04:02:15 +0000	[diff] [blame]	860	assert(end != begin && "Invalid token lexed");
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	861	--end;
				862
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	863	// FIXME: The "Value" is an uint64_t so we can handle char literals of
Chris Lattner	fc8f0e1	2011-04-15 05:22:18 +0000	[diff] [blame]	864	// up to 64-bits.
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	865	// FIXME: This extensively assumes that 'char' is 8-bits.
Chris Lattner	98be494	2008-03-05 18:54:05 +0000	[diff] [blame]	866	assert(PP.getTargetInfo().getCharWidth() == 8 &&
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	867	"Assumes char is 8 bits");
Chris Lattner	e3ad881	2009-04-28 21:51:46 +0000	[diff] [blame]	868	assert(PP.getTargetInfo().getIntWidth() <= 64 &&
				869	(PP.getTargetInfo().getIntWidth() & 7) == 0 &&
				870	"Assumes sizeof(int) on target is <= 64 and a multiple of char");
				871	assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
				872	"Assumes sizeof(wchar) on target is <= 64");
Sanjiv Gupta	4bc11af	2009-04-21 02:21:29 +0000	[diff] [blame]	873
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	874	SmallVector<uint32_t,4> codepoint_buffer;
				875	codepoint_buffer.resize(end-begin);
				876	uint32_t *buffer_begin = &codepoint_buffer.front();
				877	uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	878
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	879	// Unicode escapes representing characters that cannot be correctly
				880	// represented in a single code unit are disallowed in character literals
				881	// by this implementation.
				882	uint32_t largest_character_for_kind;
				883	if (tok::wide_char_constant == Kind) {
				884	largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
				885	} else if (tok::utf16_char_constant == Kind) {
				886	largest_character_for_kind = 0xFFFF;
				887	} else if (tok::utf32_char_constant == Kind) {
				888	largest_character_for_kind = 0x10FFFF;
				889	} else {
				890	largest_character_for_kind = 0x7Fu;
Chris Lattner	e3ad881	2009-04-28 21:51:46 +0000	[diff] [blame]	891	}
				892
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	893	while (begin!=end) {
				894	// Is this a span of non-escape characters?
				895	if (begin[0] != '\\') {
				896	char const *start = begin;
				897	do {
				898	++begin;
				899	} while (begin != end && *begin != '\\');
				900
Eli Friedman	9135930	2012-02-11 05:08:10 +0000	[diff] [blame]	901	char const *tmp_in_start = start;
				902	uint32_t *tmp_out_start = buffer_begin;
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	903	ConversionResult res =
				904	ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
				905	reinterpret_cast<UTF8 const *>(begin),
				906	&buffer_begin,buffer_end,strictConversion);
				907	if (res!=conversionOK) {
Eli Friedman	9135930	2012-02-11 05:08:10 +0000	[diff] [blame]	908	// If we see bad encoding for unprefixed character literals, warn and
				909	// simply copy the byte values, for compatibility with gcc and
				910	// older versions of clang.
				911	bool NoErrorOnBadEncoding = isAscii();
				912	unsigned Msg = diag::err_bad_character_encoding;
				913	if (NoErrorOnBadEncoding)
				914	Msg = diag::warn_bad_character_encoding;
				915	PP.Diag(Loc, Msg);
				916	if (NoErrorOnBadEncoding) {
				917	start = tmp_in_start;
				918	buffer_begin = tmp_out_start;
				919	for ( ; start != begin; ++start, ++buffer_begin)
				920	buffer_begin = static_cast<uint8_t>(start);
				921	} else {
				922	HadError = true;
				923	}
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	924	} else {
Eli Friedman	9135930	2012-02-11 05:08:10 +0000	[diff] [blame]	925	for (; tmp_out_start <buffer_begin; ++tmp_out_start) {
				926	if (*tmp_out_start > largest_character_for_kind) {
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	927	HadError = true;
				928	PP.Diag(Loc, diag::err_character_too_large);
				929	}
				930	}
				931	}
				932
				933	continue;
				934	}
				935	// Is this a Universal Character Name excape?
				936	if (begin[1] == 'u' \|\| begin[1] == 'U') {
				937	unsigned short UcnLen = 0;
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	938	if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	939	FullSourceLoc(Loc, PP.getSourceManager()),
David Blaikie	4e4d084	2012-03-11 07:00:24 +0000	[diff] [blame]	940	&PP.getDiagnostics(), PP.getLangOpts(),
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	941	true))
				942	{
				943	HadError = true;
				944	} else if (*buffer_begin > largest_character_for_kind) {
				945	HadError = true;
				946	PP.Diag(Loc,diag::err_character_too_large);
				947	}
				948
				949	++buffer_begin;
				950	continue;
				951	}
				952	unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
				953	uint64_t result =
				954	ProcessCharEscape(begin, end, HadError,
				955	FullSourceLoc(Loc,PP.getSourceManager()),
				956	CharWidth, &PP.getDiagnostics());
				957	*buffer_begin++ = result;
				958	}
				959
				960	unsigned NumCharsSoFar = buffer_begin-&codepoint_buffer.front();
				961
Chris Lattner	e3ad881	2009-04-28 21:51:46 +0000	[diff] [blame]	962	if (NumCharsSoFar > 1) {
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	963	if (isWide())
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	964	PP.Diag(Loc, diag::warn_extraneous_char_constant);
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	965	else if (isAscii() && NumCharsSoFar == 4)
				966	PP.Diag(Loc, diag::ext_four_char_character_literal);
				967	else if (isAscii())
Chris Lattner	e3ad881	2009-04-28 21:51:46 +0000	[diff] [blame]	968	PP.Diag(Loc, diag::ext_multichar_character_literal);
				969	else
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	970	PP.Diag(Loc, diag::err_multichar_utf_character_literal);
Eli Friedman	2a1c363	2009-06-01 05:25:02 +0000	[diff] [blame]	971	IsMultiChar = true;
Daniel Dunbar	930b71a	2009-07-29 01:46:05 +0000	[diff] [blame]	972	} else
				973	IsMultiChar = false;
Sanjiv Gupta	4bc11af	2009-04-21 02:21:29 +0000	[diff] [blame]	974
Seth Cantrell	be77352	2012-01-18 12:27:04 +0000	[diff] [blame]	975	llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
				976
				977	// Narrow character literals act as though their value is concatenated
				978	// in this implementation, but warn on overflow.
				979	bool multi_char_too_long = false;
				980	if (isAscii() && isMultiChar()) {
				981	LitVal = 0;
				982	for (size_t i=0;i<NumCharsSoFar;++i) {
				983	// check for enough leading zeros to shift into
				984	multi_char_too_long \|= (LitVal.countLeadingZeros() < 8);
				985	LitVal <<= 8;
				986	LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
				987	}
				988	} else if (NumCharsSoFar > 0) {
				989	// otherwise just take the last character
				990	LitVal = buffer_begin[-1];
				991	}
				992
				993	if (!HadError && multi_char_too_long) {
				994	PP.Diag(Loc,diag::warn_char_constant_too_large);
				995	}
				996
Sanjiv Gupta	4bc11af	2009-04-21 02:21:29 +0000	[diff] [blame]	997	// Transfer the value from APInt to uint64_t
				998	Value = LitVal.getZExtValue();
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	999
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1000	// If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
				1001	// if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
				1002	// character constants are not sign extended in the this implementation:
				1003	// '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	1004	if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
David Blaikie	4e4d084	2012-03-11 07:00:24 +0000	[diff] [blame]	1005	PP.getLangOpts().CharIsSigned)
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1006	Value = (signed char)Value;
				1007	}
				1008
James Dennett	a1263cf	2012-06-19 21:04:25 +0000	[diff] [blame^]	1009	/// \verbatim
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1010	/// string-literal: [C++0x lex.string]
				1011	/// encoding-prefix " [s-char-sequence] "
				1012	/// encoding-prefix R raw-string
				1013	/// encoding-prefix:
				1014	/// u8
				1015	/// u
				1016	/// U
				1017	/// L
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1018	/// s-char-sequence:
				1019	/// s-char
				1020	/// s-char-sequence s-char
				1021	/// s-char:
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1022	/// any member of the source character set except the double-quote ",
				1023	/// backslash \, or new-line character
				1024	/// escape-sequence
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1025	/// universal-character-name
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1026	/// raw-string:
				1027	/// " d-char-sequence ( r-char-sequence ) d-char-sequence "
				1028	/// r-char-sequence:
				1029	/// r-char
				1030	/// r-char-sequence r-char
				1031	/// r-char:
				1032	/// any member of the source character set, except a right parenthesis )
				1033	/// followed by the initial d-char-sequence (which may be empty)
				1034	/// followed by a double quote ".
				1035	/// d-char-sequence:
				1036	/// d-char
				1037	/// d-char-sequence d-char
				1038	/// d-char:
				1039	/// any member of the basic source character set except:
				1040	/// space, the left parenthesis (, the right parenthesis ),
				1041	/// the backslash \, and the control characters representing horizontal
				1042	/// tab, vertical tab, form feed, and newline.
				1043	/// escape-sequence: [C++0x lex.ccon]
				1044	/// simple-escape-sequence
				1045	/// octal-escape-sequence
				1046	/// hexadecimal-escape-sequence
				1047	/// simple-escape-sequence:
NAKAMURA Takumi	ddddd48	2011-08-12 05:49:51 +0000	[diff] [blame]	1048	/// one of \' \" \? \\ \a \b \f \n \r \t \v
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1049	/// octal-escape-sequence:
				1050	/// \ octal-digit
				1051	/// \ octal-digit octal-digit
				1052	/// \ octal-digit octal-digit octal-digit
				1053	/// hexadecimal-escape-sequence:
				1054	/// \x hexadecimal-digit
				1055	/// hexadecimal-escape-sequence hexadecimal-digit
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1056	/// universal-character-name:
				1057	/// \u hex-quad
				1058	/// \U hex-quad hex-quad
				1059	/// hex-quad:
				1060	/// hex-digit hex-digit hex-digit hex-digit
James Dennett	a1263cf	2012-06-19 21:04:25 +0000	[diff] [blame^]	1061	/// \endverbatim
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1062	///
				1063	StringLiteralParser::
Chris Lattner	d217773	2007-07-20 16:59:19 +0000	[diff] [blame]	1064	StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
Chris Lattner	0833dd0	2010-11-17 07:21:13 +0000	[diff] [blame]	1065	Preprocessor &PP, bool Complain)
David Blaikie	4e4d084	2012-03-11 07:00:24 +0000	[diff] [blame]	1066	: SM(PP.getSourceManager()), Features(PP.getLangOpts()),
Argyrios Kyrtzidis	403de3f	2011-05-17 22:09:56 +0000	[diff] [blame]	1067	Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	1068	MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
				1069	ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
Chris Lattner	0833dd0	2010-11-17 07:21:13 +0000	[diff] [blame]	1070	init(StringToks, NumStringToks);
				1071	}
				1072
				1073	void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
Argyrios Kyrtzidis	403de3f	2011-05-17 22:09:56 +0000	[diff] [blame]	1074	// The literal token may have come from an invalid source location (e.g. due
				1075	// to a PCH error), in which case the token length will be 0.
Argyrios Kyrtzidis	3144749	2012-05-03 17:50:32 +0000	[diff] [blame]	1076	if (NumStringToks == 0 \|\| StringToks[0].getLength() < 2)
				1077	return DiagnoseLexingError(SourceLocation());
Argyrios Kyrtzidis	403de3f	2011-05-17 22:09:56 +0000	[diff] [blame]	1078
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1079	// Scan all of the string portions, remember the max individual token length,
				1080	// computing a bound on the concatenated string length, and see whether any
				1081	// piece is a wide-string. If any of the string portions is a wide-string
				1082	// literal, the result is a wide-string literal [C99 6.4.5p4].
Argyrios Kyrtzidis	403de3f	2011-05-17 22:09:56 +0000	[diff] [blame]	1083	assert(NumStringToks && "expected at least one token");
Sean Hunt	6cf7502	2010-08-30 17:47:05 +0000	[diff] [blame]	1084	MaxTokenLength = StringToks[0].getLength();
Argyrios Kyrtzidis	403de3f	2011-05-17 22:09:56 +0000	[diff] [blame]	1085	assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
Sean Hunt	6cf7502	2010-08-30 17:47:05 +0000	[diff] [blame]	1086	SizeBound = StringToks[0].getLength()-2; // -2 for "".
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	1087	Kind = StringToks[0].getKind();
Sean Hunt	6cf7502	2010-08-30 17:47:05 +0000	[diff] [blame]	1088
				1089	hadError = false;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1090
				1091	// Implement Translation Phase #6: concatenation of string literals
				1092	/// (C99 5.1.1.2p1). The common case is only one string fragment.
				1093	for (unsigned i = 1; i != NumStringToks; ++i) {
Argyrios Kyrtzidis	3144749	2012-05-03 17:50:32 +0000	[diff] [blame]	1094	if (StringToks[i].getLength() < 2)
				1095	return DiagnoseLexingError(StringToks[i].getLocation());
Argyrios Kyrtzidis	403de3f	2011-05-17 22:09:56 +0000	[diff] [blame]	1096
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1097	// The string could be shorter than this if it needs cleaning, but this is a
				1098	// reasonable bound, which is all we need.
Argyrios Kyrtzidis	403de3f	2011-05-17 22:09:56 +0000	[diff] [blame]	1099	assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
Sean Hunt	6cf7502	2010-08-30 17:47:05 +0000	[diff] [blame]	1100	SizeBound += StringToks[i].getLength()-2; // -2 for "".
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1101
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1102	// Remember maximum string piece length.
Sean Hunt	6cf7502	2010-08-30 17:47:05 +0000	[diff] [blame]	1103	if (StringToks[i].getLength() > MaxTokenLength)
				1104	MaxTokenLength = StringToks[i].getLength();
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1105
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	1106	// Remember if we see any wide or utf-8/16/32 strings.
				1107	// Also check for illegal concatenations.
				1108	if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
				1109	if (isAscii()) {
				1110	Kind = StringToks[i].getKind();
				1111	} else {
				1112	if (Diags)
				1113	Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
				1114	diag::err_unsupported_string_concat);
				1115	hadError = true;
				1116	}
				1117	}
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1118	}
Chris Lattner	dbb1ecc	2009-02-26 23:01:51 +0000	[diff] [blame]	1119
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1120	// Include space for the null terminator.
				1121	++SizeBound;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1122
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1123	// TODO: K&R warning: "traditional C rejects string constant concatenation"
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1124
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	1125	// Get the width in bytes of char/wchar_t/char16_t/char32_t
				1126	CharByteWidth = getCharWidth(Kind, Target);
				1127	assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
				1128	CharByteWidth /= 8;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1129
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1130	// The output buffer size needs to be large enough to hold wide characters.
				1131	// This is a worst-case assumption which basically corresponds to L"" "long".
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	1132	SizeBound *= CharByteWidth;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1133
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1134	// Size the temporary buffer to hold the result string data.
				1135	ResultBuf.resize(SizeBound);
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1136
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1137	// Likewise, but for each string piece.
Dylan Noblesmith	f7ccbad	2012-02-05 02:13:05 +0000	[diff] [blame]	1138	SmallString<512> TokenBuf;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1139	TokenBuf.resize(MaxTokenLength);
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1140
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1141	// Loop over all the strings, getting their spelling, and expanding them to
				1142	// wide strings as appropriate.
				1143	ResultPtr = &ResultBuf[0]; // Next byte to fill in.
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1144
Anders Carlsson	ee98ac5	2007-10-15 02:50:23 +0000	[diff] [blame]	1145	Pascal = false;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1146
Richard Smith	5cc2c6e	2012-03-05 04:02:15 +0000	[diff] [blame]	1147	SourceLocation UDSuffixTokLoc;
				1148
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1149	for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
				1150	const char *ThisTokBuf = &TokenBuf[0];
				1151	// Get the spelling of the token, which eliminates trigraphs, etc. We know
				1152	// that ThisTokBuf points to a buffer that is big enough for the whole token
				1153	// and 'spelled' tokens can only shrink.
Douglas Gregor	50f6af7	2010-03-16 05:20:39 +0000	[diff] [blame]	1154	bool StringInvalid = false;
Chris Lattner	0833dd0	2010-11-17 07:21:13 +0000	[diff] [blame]	1155	unsigned ThisTokLen =
Chris Lattner	b060727	2010-11-17 07:26:20 +0000	[diff] [blame]	1156	Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
				1157	&StringInvalid);
Argyrios Kyrtzidis	3144749	2012-05-03 17:50:32 +0000	[diff] [blame]	1158	if (StringInvalid)
				1159	return DiagnoseLexingError(StringToks[i].getLocation());
Douglas Gregor	50f6af7	2010-03-16 05:20:39 +0000	[diff] [blame]	1160
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	1161	const char *ThisTokBegin = ThisTokBuf;
Richard Smith	5cc2c6e	2012-03-05 04:02:15 +0000	[diff] [blame]	1162	const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
				1163
				1164	// Remove an optional ud-suffix.
				1165	if (ThisTokEnd[-1] != '"') {
				1166	const char *UDSuffixEnd = ThisTokEnd;
				1167	do {
				1168	--ThisTokEnd;
				1169	} while (ThisTokEnd[-1] != '"');
				1170
				1171	StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
				1172
				1173	if (UDSuffixBuf.empty()) {
				1174	UDSuffixBuf.assign(UDSuffix);
Richard Smith	dd66be7	2012-03-08 01:34:56 +0000	[diff] [blame]	1175	UDSuffixToken = i;
				1176	UDSuffixOffset = ThisTokEnd - ThisTokBuf;
Richard Smith	5cc2c6e	2012-03-05 04:02:15 +0000	[diff] [blame]	1177	UDSuffixTokLoc = StringToks[i].getLocation();
				1178	} else if (!UDSuffixBuf.equals(UDSuffix)) {
				1179	// C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
				1180	// result of a concatenation involving at least one user-defined-string-
				1181	// literal, all the participating user-defined-string-literals shall
				1182	// have the same ud-suffix.
				1183	if (Diags) {
				1184	SourceLocation TokLoc = StringToks[i].getLocation();
				1185	Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
				1186	<< UDSuffixBuf << UDSuffix
				1187	<< SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
				1188	<< SourceRange(TokLoc, TokLoc);
				1189	}
				1190	hadError = true;
				1191	}
				1192	}
				1193
				1194	// Strip the end quote.
				1195	--ThisTokEnd;
				1196
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1197	// TODO: Input character set mapping support.
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1198
Craig Topper	1661d71	2011-08-08 06:10:39 +0000	[diff] [blame]	1199	// Skip marker for wide or unicode strings.
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	1200	if (ThisTokBuf[0] == 'L' \|\| ThisTokBuf[0] == 'u' \|\| ThisTokBuf[0] == 'U') {
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1201	++ThisTokBuf;
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	1202	// Skip 8 of u8 marker for utf8 strings.
				1203	if (ThisTokBuf[0] == '8')
				1204	++ThisTokBuf;
Fariborz Jahanian	56bedef	2010-08-31 23:34:27 +0000	[diff] [blame]	1205	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1206
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1207	// Check for raw string
				1208	if (ThisTokBuf[0] == 'R') {
				1209	ThisTokBuf += 2; // skip R"
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1210
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1211	const char *Prefix = ThisTokBuf;
				1212	while (ThisTokBuf[0] != '(')
Anders Carlsson	ee98ac5	2007-10-15 02:50:23 +0000	[diff] [blame]	1213	++ThisTokBuf;
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1214	++ThisTokBuf; // skip '('
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1215
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	1216	// Remove same number of characters from the end
				1217	ThisTokEnd -= ThisTokBuf - Prefix;
				1218	assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1219
				1220	// Copy the string over
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	1221	if (CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
Eli Friedman	9135930	2012-02-11 05:08:10 +0000	[diff] [blame]	1222	if (DiagnoseBadString(StringToks[i]))
				1223	hadError = true;
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1224	} else {
Argyrios Kyrtzidis	07a0758	2012-05-03 01:01:56 +0000	[diff] [blame]	1225	if (ThisTokBuf[0] != '"') {
				1226	// The file may have come from PCH and then changed after loading the
				1227	// PCH; Fail gracefully.
Argyrios Kyrtzidis	3144749	2012-05-03 17:50:32 +0000	[diff] [blame]	1228	return DiagnoseLexingError(StringToks[i].getLocation());
Argyrios Kyrtzidis	07a0758	2012-05-03 01:01:56 +0000	[diff] [blame]	1229	}
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1230	++ThisTokBuf; // skip "
				1231
				1232	// Check if this is a pascal string
				1233	if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
				1234	ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
				1235
				1236	// If the \p sequence is found in the first token, we have a pascal string
				1237	// Otherwise, if we already have a pascal string, ignore the first \p
				1238	if (i == 0) {
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1239	++ThisTokBuf;
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1240	Pascal = true;
				1241	} else if (Pascal)
				1242	ThisTokBuf += 2;
				1243	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1244
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1245	while (ThisTokBuf != ThisTokEnd) {
				1246	// Is this a span of non-escape characters?
				1247	if (ThisTokBuf[0] != '\\') {
				1248	const char *InStart = ThisTokBuf;
				1249	do {
				1250	++ThisTokBuf;
				1251	} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
				1252
				1253	// Copy the character span over.
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	1254	if (CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)))
Eli Friedman	9135930	2012-02-11 05:08:10 +0000	[diff] [blame]	1255	if (DiagnoseBadString(StringToks[i]))
				1256	hadError = true;
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1257	continue;
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1258	}
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1259	// Is this a Universal Character Name escape?
				1260	if (ThisTokBuf[1] == 'u' \|\| ThisTokBuf[1] == 'U') {
Richard Smith	26b75c0	2012-03-09 22:27:51 +0000	[diff] [blame]	1261	EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
				1262	ResultPtr, hadError,
				1263	FullSourceLoc(StringToks[i].getLocation(), SM),
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1264	CharByteWidth, Diags, Features);
				1265	continue;
				1266	}
				1267	// Otherwise, this is a non-UCN escape character. Process it.
				1268	unsigned ResultChar =
				1269	ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
				1270	FullSourceLoc(StringToks[i].getLocation(), SM),
				1271	CharByteWidth*8, Diags);
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1272
Eli Friedman	caf1f26	2011-11-02 23:06:23 +0000	[diff] [blame]	1273	if (CharByteWidth == 4) {
				1274	// FIXME: Make the type of the result buffer correct instead of
				1275	// using reinterpret_cast.
				1276	UTF32 ResultWidePtr = reinterpret_cast<UTF32>(ResultPtr);
Nico Weber	9b483df	2011-11-14 05:17:37 +0000	[diff] [blame]	1277	*ResultWidePtr = ResultChar;
Eli Friedman	caf1f26	2011-11-02 23:06:23 +0000	[diff] [blame]	1278	ResultPtr += 4;
				1279	} else if (CharByteWidth == 2) {
				1280	// FIXME: Make the type of the result buffer correct instead of
				1281	// using reinterpret_cast.
				1282	UTF16 ResultWidePtr = reinterpret_cast<UTF16>(ResultPtr);
Nico Weber	9b483df	2011-11-14 05:17:37 +0000	[diff] [blame]	1283	*ResultWidePtr = ResultChar & 0xFFFF;
Eli Friedman	caf1f26	2011-11-02 23:06:23 +0000	[diff] [blame]	1284	ResultPtr += 2;
				1285	} else {
				1286	assert(CharByteWidth == 1 && "Unexpected char width");
				1287	*ResultPtr++ = ResultChar & 0xFF;
				1288	}
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1289	}
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1290	}
				1291	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1292
Chris Lattner	bbee00b	2009-01-16 18:51:42 +0000	[diff] [blame]	1293	if (Pascal) {
Eli Friedman	22508f4	2011-11-05 00:41:04 +0000	[diff] [blame]	1294	if (CharByteWidth == 4) {
				1295	// FIXME: Make the type of the result buffer correct instead of
				1296	// using reinterpret_cast.
				1297	UTF32 ResultWidePtr = reinterpret_cast<UTF32>(ResultBuf.data());
				1298	ResultWidePtr[0] = GetNumStringChars() - 1;
				1299	} else if (CharByteWidth == 2) {
				1300	// FIXME: Make the type of the result buffer correct instead of
				1301	// using reinterpret_cast.
				1302	UTF16 ResultWidePtr = reinterpret_cast<UTF16>(ResultBuf.data());
				1303	ResultWidePtr[0] = GetNumStringChars() - 1;
				1304	} else {
				1305	assert(CharByteWidth == 1 && "Unexpected char width");
				1306	ResultBuf[0] = GetNumStringChars() - 1;
				1307	}
Chris Lattner	bbee00b	2009-01-16 18:51:42 +0000	[diff] [blame]	1308
				1309	// Verify that pascal strings aren't too large.
Chris Lattner	0833dd0	2010-11-17 07:21:13 +0000	[diff] [blame]	1310	if (GetStringLength() > 256) {
				1311	if (Diags)
				1312	Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
				1313	diag::err_pascal_string_too_long)
				1314	<< SourceRange(StringToks[0].getLocation(),
				1315	StringToks[NumStringToks-1].getLocation());
Douglas Gregor	5cee119	2011-07-27 05:40:30 +0000	[diff] [blame]	1316	hadError = true;
Eli Friedman	57d7dde	2009-04-01 03:17:08 +0000	[diff] [blame]	1317	return;
				1318	}
Chris Lattner	0833dd0	2010-11-17 07:21:13 +0000	[diff] [blame]	1319	} else if (Diags) {
Douglas Gregor	427c492	2010-07-20 14:33:20 +0000	[diff] [blame]	1320	// Complain if this string literal has too many characters.
Chris Lattner	a95880d	2010-11-17 07:12:42 +0000	[diff] [blame]	1321	unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
Douglas Gregor	427c492	2010-07-20 14:33:20 +0000	[diff] [blame]	1322
				1323	if (GetNumStringChars() > MaxChars)
Chris Lattner	0833dd0	2010-11-17 07:21:13 +0000	[diff] [blame]	1324	Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
				1325	diag::ext_string_too_long)
Douglas Gregor	427c492	2010-07-20 14:33:20 +0000	[diff] [blame]	1326	<< GetNumStringChars() << MaxChars
Chris Lattner	a95880d	2010-11-17 07:12:42 +0000	[diff] [blame]	1327	<< (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
Douglas Gregor	427c492	2010-07-20 14:33:20 +0000	[diff] [blame]	1328	<< SourceRange(StringToks[0].getLocation(),
				1329	StringToks[NumStringToks-1].getLocation());
Chris Lattner	bbee00b	2009-01-16 18:51:42 +0000	[diff] [blame]	1330	}
Reid Spencer	5f016e2	2007-07-11 17:01:13 +0000	[diff] [blame]	1331	}
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1332
				1333
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1334	/// copyStringFragment - This function copies from Start to End into ResultPtr.
				1335	/// Performs widening for multi-byte characters.
Eli Friedman	f74a458	2011-11-01 02:14:50 +0000	[diff] [blame]	1336	bool StringLiteralParser::CopyStringFragment(StringRef Fragment) {
				1337	assert(CharByteWidth==1 \|\| CharByteWidth==2 \|\| CharByteWidth==4);
				1338	ConversionResult result = conversionOK;
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1339	// Copy the character span over.
				1340	if (CharByteWidth == 1) {
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	1341	if (!isLegalUTF8String(reinterpret_cast<const UTF8*>(Fragment.begin()),
				1342	reinterpret_cast<const UTF8*>(Fragment.end())))
Eli Friedman	9135930	2012-02-11 05:08:10 +0000	[diff] [blame]	1343	result = sourceIllegal;
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1344	memcpy(ResultPtr, Fragment.data(), Fragment.size());
				1345	ResultPtr += Fragment.size();
Eli Friedman	f74a458	2011-11-01 02:14:50 +0000	[diff] [blame]	1346	} else if (CharByteWidth == 2) {
				1347	UTF8 const sourceStart = (UTF8 const )Fragment.data();
				1348	// FIXME: Make the type of the result buffer correct instead of
				1349	// using reinterpret_cast.
				1350	UTF16 targetStart = reinterpret_cast<UTF16>(ResultPtr);
Eli Friedman	9135930	2012-02-11 05:08:10 +0000	[diff] [blame]	1351	ConversionFlags flags = strictConversion;
Eli Friedman	f74a458	2011-11-01 02:14:50 +0000	[diff] [blame]	1352	result = ConvertUTF8toUTF16(
				1353	&sourceStart,sourceStart + Fragment.size(),
				1354	&targetStart,targetStart + 2*Fragment.size(),flags);
				1355	if (result==conversionOK)
				1356	ResultPtr = reinterpret_cast<char*>(targetStart);
				1357	} else if (CharByteWidth == 4) {
				1358	UTF8 const sourceStart = (UTF8 const )Fragment.data();
				1359	// FIXME: Make the type of the result buffer correct instead of
				1360	// using reinterpret_cast.
				1361	UTF32 targetStart = reinterpret_cast<UTF32>(ResultPtr);
Eli Friedman	9135930	2012-02-11 05:08:10 +0000	[diff] [blame]	1362	ConversionFlags flags = strictConversion;
Eli Friedman	f74a458	2011-11-01 02:14:50 +0000	[diff] [blame]	1363	result = ConvertUTF8toUTF32(
				1364	&sourceStart,sourceStart + Fragment.size(),
				1365	&targetStart,targetStart + 4*Fragment.size(),flags);
				1366	if (result==conversionOK)
				1367	ResultPtr = reinterpret_cast<char*>(targetStart);
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1368	}
Eli Friedman	f74a458	2011-11-01 02:14:50 +0000	[diff] [blame]	1369	assert((result != targetExhausted)
				1370	&& "ConvertUTF8toUTFXX exhausted target buffer");
				1371	return result != conversionOK;
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1372	}
				1373
Eli Friedman	9135930	2012-02-11 05:08:10 +0000	[diff] [blame]	1374	bool StringLiteralParser::DiagnoseBadString(const Token &Tok) {
				1375	// If we see bad encoding for unprefixed string literals, warn and
				1376	// simply copy the byte values, for compatibility with gcc and older
				1377	// versions of clang.
				1378	bool NoErrorOnBadEncoding = isAscii();
				1379	unsigned Msg = NoErrorOnBadEncoding ? diag::warn_bad_string_encoding :
				1380	diag::err_bad_string_encoding;
				1381	if (Diags)
				1382	Diags->Report(FullSourceLoc(Tok.getLocation(), SM), Msg);
				1383	return !NoErrorOnBadEncoding;
				1384	}
Craig Topper	2fa4e86	2011-08-11 04:06:15 +0000	[diff] [blame]	1385
Argyrios Kyrtzidis	3144749	2012-05-03 17:50:32 +0000	[diff] [blame]	1386	void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
				1387	hadError = true;
				1388	if (Diags)
				1389	Diags->Report(Loc, diag::err_lexing_string);
				1390	}
				1391
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1392	/// getOffsetOfStringByte - This function returns the offset of the
				1393	/// specified byte of the string data represented by Token. This handles
				1394	/// advancing over escape sequences in the string.
				1395	unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
Chris Lattner	6c66f07	2010-11-17 06:46:14 +0000	[diff] [blame]	1396	unsigned ByteNo) const {
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1397	// Get the spelling of the token.
Dylan Noblesmith	f7ccbad	2012-02-05 02:13:05 +0000	[diff] [blame]	1398	SmallString<32> SpellingBuffer;
Sean Hunt	6cf7502	2010-08-30 17:47:05 +0000	[diff] [blame]	1399	SpellingBuffer.resize(Tok.getLength());
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1400
Douglas Gregor	50f6af7	2010-03-16 05:20:39 +0000	[diff] [blame]	1401	bool StringInvalid = false;
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1402	const char *SpellingPtr = &SpellingBuffer[0];
Chris Lattner	b060727	2010-11-17 07:26:20 +0000	[diff] [blame]	1403	unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
				1404	&StringInvalid);
Chris Lattner	91f54ce	2010-11-17 06:26:08 +0000	[diff] [blame]	1405	if (StringInvalid)
Douglas Gregor	50f6af7	2010-03-16 05:20:39 +0000	[diff] [blame]	1406	return 0;
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1407
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1408	const char *SpellingStart = SpellingPtr;
				1409	const char *SpellingEnd = SpellingPtr+TokLen;
				1410
Richard Smith	df9ef1b	2012-06-13 05:37:23 +0000	[diff] [blame]	1411	// Handle UTF-8 strings just like narrow strings.
				1412	if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
				1413	SpellingPtr += 2;
				1414
				1415	assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
				1416	SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
				1417
				1418	// For raw string literals, this is easy.
				1419	if (SpellingPtr[0] == 'R') {
				1420	assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
				1421	// Skip 'R"'.
				1422	SpellingPtr += 2;
				1423	while (*SpellingPtr != '(') {
				1424	++SpellingPtr;
				1425	assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
				1426	}
				1427	// Skip '('.
				1428	++SpellingPtr;
				1429	return SpellingPtr - SpellingStart + ByteNo;
				1430	}
				1431
				1432	// Skip over the leading quote
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1433	assert(SpellingPtr[0] == '"' && "Should be a string literal!");
				1434	++SpellingPtr;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1435
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1436	// Skip over bytes until we find the offset we're looking for.
				1437	while (ByteNo) {
				1438	assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1439
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1440	// Step over non-escapes simply.
				1441	if (*SpellingPtr != '\\') {
				1442	++SpellingPtr;
				1443	--ByteNo;
				1444	continue;
				1445	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1446
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1447	// Otherwise, this is an escape character. Advance over it.
				1448	bool HadError = false;
Richard Smith	df9ef1b	2012-06-13 05:37:23 +0000	[diff] [blame]	1449	if (SpellingPtr[1] == 'u' \|\| SpellingPtr[1] == 'U') {
				1450	const char *EscapePtr = SpellingPtr;
				1451	unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
				1452	1, Features, HadError);
				1453	if (Len > ByteNo) {
				1454	// ByteNo is somewhere within the escape sequence.
				1455	SpellingPtr = EscapePtr;
				1456	break;
				1457	}
				1458	ByteNo -= Len;
				1459	} else {
				1460	ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
				1461	FullSourceLoc(Tok.getLocation(), SM),
				1462	CharByteWidth*8, Diags);
				1463	--ByteNo;
				1464	}
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1465	assert(!HadError && "This method isn't valid on erroneous strings");
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1466	}
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	1467
Chris Lattner	719e615	2009-02-18 19:21:10 +0000	[diff] [blame]	1468	return SpellingPtr-SpellingStart;
				1469	}