Blame - lib/Lex/LiteralSupport.cpp - fp2-dev/platform/external/clang

2007-07-11 17:01:13 +0000

[diff] [blame]

1

//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//

2

//

3

// The LLVM Compiler Infrastructure

4

//

Chris Lattner

0bc735f

2007-12-29 19:59:25 +0000

[diff] [blame]

5

// This file is distributed under the University of Illinois Open Source

6

// License. See LICENSE.TXT for details.

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

7

//

8

//===----------------------------------------------------------------------===//

9

//

10

// This file implements the NumericLiteralParser, CharLiteralParser, and

11

// StringLiteralParser interfaces.

12

//

13

//===----------------------------------------------------------------------===//

14

15

#include "clang/Lex/LiteralSupport.h"

16

#include "clang/Lex/Preprocessor.h"

Chris Lattner

500d329

2009-01-29 05:15:15 +0000

[diff] [blame]

17

#include "clang/Lex/LexDiagnostic.h"

Chris Lattner

136f93a

2007-07-16 06:55:01 +0000

[diff] [blame]

18

#include "clang/Basic/TargetInfo.h"

Eli Friedman

f74a458

2011-11-01 02:14:50 +0000

[diff] [blame]

19

#include "clang/Basic/ConvertUTF.h"

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

20

#include "llvm/ADT/StringExtras.h"

David Blaikie

9fe8c74

2011-09-23 05:35:21 +0000

[diff] [blame]

21

#include "llvm/Support/ErrorHandling.h"

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

22

using namespace clang;

23

24

/// HexDigitValue - Map a single hexadecimal digit character ('0'-'9',
/// 'a'-'f' or 'A'-'F') to its numeric value in the range 0..15.  Any other
/// character yields -1.
static int HexDigitValue(char C) {
  switch (C) {
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    return C - '0';
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    return 10 + (C - 'a');
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    return 10 + (C - 'A');
  default:
    return -1;
  }
}

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

33

/// Select the target's character width for the given character-constant or
/// string-literal token kind by forwarding to the matching TargetInfo query
/// (plain/UTF-8, wide, UTF-16 or UTF-32).  Any other token kind is a
/// programming error and hits llvm_unreachable.
static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  // Ordinary and UTF-8 literals share the plain 'char' width.
  if (kind == tok::char_constant || kind == tok::string_literal ||
      kind == tok::utf8_string_literal)
    return Target.getCharWidth();

  // L'...' and L"...".
  if (kind == tok::wide_char_constant || kind == tok::wide_string_literal)
    return Target.getWCharWidth();

  // u'...' and u"...".
  if (kind == tok::utf16_char_constant || kind == tok::utf16_string_literal)
    return Target.getChar16Width();

  // U'...' and U"...".
  if (kind == tok::utf32_char_constant || kind == tok::utf32_string_literal)
    return Target.getChar32Width();

  llvm_unreachable("Unknown token type!");
}

Seth Cantrell

2012-10-28 18:24:46 +0000

[diff] [blame]

52

/// Translate the byte span [TokRangeBegin, TokRangeEnd) of a token's spelling
/// buffer (which starts at TokBegin) into a character source range.
///
/// The start location is found by advancing TokLoc by the offset of
/// TokRangeBegin within the buffer; the end location is found by advancing
/// that start location by the length of the span.
static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
                                           FullSourceLoc TokLoc,
                                           const char *TokBegin,
                                           const char *TokRangeBegin,
                                           const char *TokRangeEnd) {
  SourceLocation RangeStart = Lexer::AdvanceToTokenCharacter(
      TokLoc, TokRangeBegin - TokBegin, TokLoc.getManager(), Features);

  SourceLocation RangeStop = Lexer::AdvanceToTokenCharacter(
      RangeStart, TokRangeEnd - TokRangeBegin, TokLoc.getManager(), Features);

  return CharSourceRange::getCharRange(RangeStart, RangeStop);
}

65

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

66

/// \brief Produce a diagnostic highlighting some portion of a literal.

67

///

68

/// Emits the diagnostic \p DiagID, highlighting the range of characters from

69

/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be

70

/// a substring of a spelling buffer for the token beginning at \p TokBegin.

71

static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,

72

const LangOptions &Features, FullSourceLoc TokLoc,

73

const char *TokBegin, const char *TokRangeBegin,

74

const char *TokRangeEnd, unsigned DiagID) {

75

SourceLocation Begin =

76

Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,

77

TokLoc.getManager(), Features);

Seth Cantrell

2012-10-28 18:24:46 +0000

[diff] [blame]

78

return Diags->Report(Begin, DiagID) <<

79

MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

80

}

81

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

82

/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in

83

/// either a character or a string literal.

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

84

static unsigned ProcessCharEscape(const char *ThisTokBegin,

85

const char *&ThisTokBuf,

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

86

const char *ThisTokEnd, bool &HadError,

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

87

FullSourceLoc Loc, unsigned CharWidth,

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

88

DiagnosticsEngine *Diags,

89

const LangOptions &Features) {

90

const char *EscapeBegin = ThisTokBuf;

91

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

92

// Skip the '\' char.

93

++ThisTokBuf;

94

95

// We know that this character can't be off the end of the buffer, because

96

// that would have been \", which would not have been the end of string.

97

unsigned ResultChar = *ThisTokBuf++;

98

switch (ResultChar) {

99

// These map to themselves.

100

case '\\': case '\'': case '"': case '?': break;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

101

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

102

// These have fixed mappings.

103

case 'a':

104

// TODO: K&R: the meaning of '\\a' is different in traditional C

ResultChar = 7;

break;

case 'b':

ResultChar = 8;

break;

case 'e':

Chris Lattner

2010-11-17 06:26:08 +0000

[diff] [blame]

111

if (Diags)

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

112

Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,

113

diag::ext_nonstandard_escape) << "e";

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

114

ResultChar = 27;

115

break;

Eli Friedman

3c54801

2009-06-10 01:32:39 +0000

[diff] [blame]

116

case 'E':

Chris Lattner

2010-11-17 06:26:08 +0000

[diff] [blame]

117

if (Diags)

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

118

Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,

119

diag::ext_nonstandard_escape) << "E";

Eli Friedman

3c54801

2009-06-10 01:32:39 +0000

[diff] [blame]

120

ResultChar = 27;

121

break;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

case 'f':

ResultChar = 12;

break;

case 'n':

ResultChar = 10;

break;

case 'r':

ResultChar = 13;

break;

case 't':

ResultChar = 9;

break;

case 'v':

ResultChar = 11;

break;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

137

case 'x': { // Hex escape.

138

ResultChar = 0;

139

if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {

Chris Lattner

2010-11-17 06:26:08 +0000

[diff] [blame]

140

if (Diags)

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

141

Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,

142

diag::err_hex_escape_no_digits);

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

143

HadError = 1;

144

break;

145

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

146

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

147

// Hex escapes are a maximal series of hex digits.

148

bool Overflow = false;

149

for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {

150

int CharVal = HexDigitValue(ThisTokBuf[0]);

151

if (CharVal == -1) break;

Chris Lattner

c29bbde

2008-09-30 20:45:40 +0000

[diff] [blame]

152

// About to shift out a digit?

153

Overflow |= (ResultChar & 0xF0000000) ? true : false;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

154

ResultChar <<= 4;

155

ResultChar |= CharVal;

156

}

157

158

// See if any bits will be truncated when evaluated as a character.

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

159

if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {

160

Overflow = true;

161

ResultChar &= ~0U >> (32-CharWidth);

162

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

163

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

164

// Check for overflow.

Chris Lattner

2010-11-17 06:26:08 +0000

[diff] [blame]

165

if (Overflow && Diags) // Too many digits to fit in

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

166

Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,

167

diag::warn_hex_escape_too_large);

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

168

break;

169

}

170

case '0': case '1': case '2': case '3':

171

case '4': case '5': case '6': case '7': {

// Octal escapes.

--ThisTokBuf;

ResultChar = 0;

// Octal escapes are a series of octal digits with maximum length 3.

177

// "\0123" is a two digit sequence equal to "\012" "3".

178

unsigned NumDigits = 0;

179

do {

180

ResultChar <<= 3;

181

ResultChar |= *ThisTokBuf++ - '0';

182

++NumDigits;

183

} while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&

184

ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

185

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

186

// Check for overflow. Reject '\777', but not L'\777'.

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

187

if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {

Chris Lattner

2010-11-17 06:26:08 +0000

[diff] [blame]

188

if (Diags)

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

189

Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,

190

diag::warn_octal_escape_too_large);

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

191

ResultChar &= ~0U >> (32-CharWidth);

192

}

193

break;

194

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

195

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

196

// Otherwise, these are not valid escapes.

197

case '(': case '{': case '[': case '%':

198

// GCC accepts these as extensions. We warn about them as such though.

Chris Lattner

2010-11-17 06:26:08 +0000

[diff] [blame]

199

if (Diags)

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

200

Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,

201

diag::ext_nonstandard_escape)

202

<< std::string(1, ResultChar);

Eli Friedman

f01fdff

2009-04-28 00:51:18 +0000

[diff] [blame]

203

break;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

204

default:

Chris Lattner

2010-11-17 06:26:08 +0000

[diff] [blame]

205

if (Diags == 0)

Douglas Gregor

b90f4b3

2010-05-26 05:35:51 +0000

[diff] [blame]

206

break;

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

207

Ted Kremenek

23ef69d

2010-12-03 00:09:56 +0000

[diff] [blame]

208

if (isgraph(ResultChar))

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

209

Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,

210

diag::ext_unknown_escape)

211

<< std::string(1, ResultChar);

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

212

else

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

213

Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,

214

diag::ext_unknown_escape)

215

<< "x" + llvm::utohexstr(ResultChar);

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

216

break;

217

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

218

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

return ResultChar;

}

Steve Naroff

2009-03-30 23:46:03 +0000

[diff] [blame]

222

/// ProcessUCNEscape - Read the Universal Character Name, check constraints and

Nico Weber

2010-10-09 00:27:47 +0000

[diff] [blame]

223

/// return the UTF32.

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

224

static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,

225

const char *ThisTokEnd,

Nico Weber

2010-10-09 00:27:47 +0000

[diff] [blame]

226

uint32_t &UcnVal, unsigned short &UcnLen,

David Blaikie

d6471f7

2011-09-25 23:23:43 +0000

[diff] [blame]

227

FullSourceLoc Loc, DiagnosticsEngine *Diags,

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

228

const LangOptions &Features,

229

bool in_char_string_literal = false) {

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

230

const char *UcnBegin = ThisTokBuf;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

231

Steve Naroff

2009-03-30 23:46:03 +0000

[diff] [blame]

232

// Skip the '\u' char's.

233

ThisTokBuf += 2;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

234

Steve Naroff

2009-03-30 23:46:03 +0000

[diff] [blame]

235

if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {

Chris Lattner

6c66f07

2010-11-17 06:46:14 +0000

[diff] [blame]

236

if (Diags)

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

237

Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,

238

diag::err_ucn_escape_no_digits);

Nico Weber

2010-10-09 00:27:47 +0000

[diff] [blame]

239

return false;

Steve Naroff

2009-03-30 23:46:03 +0000

[diff] [blame]

240

}

Nico Weber

2010-10-09 00:27:47 +0000

[diff] [blame]

241

UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);

Fariborz Jahanian

56bedef

2010-08-31 23:34:27 +0000

[diff] [blame]

242

unsigned short UcnLenSave = UcnLen;

Nico Weber

2010-10-09 00:27:47 +0000

[diff] [blame]

243

for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {

Steve Naroff

2009-03-30 23:46:03 +0000

[diff] [blame]

244

int CharVal = HexDigitValue(ThisTokBuf[0]);

245

if (CharVal == -1) break;

UcnVal <<= 4;

UcnVal |= CharVal;

}

// If we didn't consume the proper number of digits, there is a problem.

Nico Weber

2010-10-09 00:27:47 +0000

[diff] [blame]

250

if (UcnLenSave) {

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

251

if (Diags)

252

Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,

253

diag::err_ucn_escape_incomplete);

Nico Weber

2010-10-09 00:27:47 +0000

[diff] [blame]

254

return false;

Steve Naroff

2009-03-30 23:46:03 +0000

[diff] [blame]

255

}

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

256

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

257

// Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

258

if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints

259

UcnVal > 0x10FFFF) { // maximum legal UTF32 value

Chris Lattner

6c66f07

2010-11-17 06:46:14 +0000

[diff] [blame]

260

if (Diags)

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

261

Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,

262

diag::err_ucn_escape_invalid);

Nico Weber

2010-10-09 00:27:47 +0000

[diff] [blame]

263

return false;

264

}

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

265

266

// C++11 allows UCNs that refer to control characters and basic source

267

// characters inside character and string literals

268

if (UcnVal < 0xa0 &&

269

(UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, `

270

bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal);

271

if (Diags) {

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

272

char BasicSCSChar = UcnVal;

273

if (UcnVal >= 0x20 && UcnVal < 0x7f)

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

274

Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,

275

IsError ? diag::err_ucn_escape_basic_scs :

276

diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)

277

<< StringRef(&BasicSCSChar, 1);

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

278

else

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

279

Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,

280

IsError ? diag::err_ucn_control_character :

281

diag::warn_cxx98_compat_literal_ucn_control_character);

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

}

if (IsError)

return false;

}

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

287

if (!Features.CPlusPlus && !Features.C99 && Diags)

288

Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,

289

diag::warn_ucn_not_valid_in_c89);

290

Nico Weber

2010-10-09 00:27:47 +0000

[diff] [blame]

return true;

}

Richard Smith

2012-06-13 05:37:23 +0000

[diff] [blame]

294

/// MeasureUCNEscape - Determine the number of bytes within the resulting string

295

/// which this UCN will occupy.

296

static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,

297

const char *ThisTokEnd, unsigned CharByteWidth,

298

const LangOptions &Features, bool &HadError) {

299

// UTF-32: 4 bytes per escape.

300

if (CharByteWidth == 4)

return 4;

uint32_t UcnVal = 0;

unsigned short UcnLen = 0;

305

FullSourceLoc Loc;

306

307

if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,

308

UcnLen, Loc, 0, Features, true)) {

HadError = true;

return 0;

}

// UTF-16: 2 bytes for BMP, 4 bytes otherwise.

314

if (CharByteWidth == 2)

315

return UcnVal <= 0xFFFF ? 2 : 4;

// UTF-8.

if (UcnVal < 0x80)

return 1;

if (UcnVal < 0x800)

return 2;

if (UcnVal < 0x10000)

return 3;

return 4;

}

Nico Weber

2010-10-09 00:27:47 +0000

[diff] [blame]

327

/// EncodeUCNEscape - Read the Universal Character Name, check constraints and

328

/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of

329

/// StringLiteralParser. When we decide to implement UCN's for identifiers,

330

/// we will likely rework our support for UCN's.

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

331

static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,

332

const char *ThisTokEnd,

Chris Lattner

a95880d

2010-11-17 07:12:42 +0000

[diff] [blame]

333

char *&ResultBuf, bool &HadError,

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

334

FullSourceLoc Loc, unsigned CharByteWidth,

David Blaikie

d6471f7

2011-09-25 23:23:43 +0000

[diff] [blame]

335

DiagnosticsEngine *Diags,

336

const LangOptions &Features) {

Nico Weber

2010-10-09 00:27:47 +0000

[diff] [blame]

337

typedef uint32_t UTF32;

338

UTF32 UcnVal = 0;

339

unsigned short UcnLen = 0;

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

340

if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,

341

Loc, Diags, Features, true)) {

Richard Smith

2012-06-13 05:37:23 +0000

[diff] [blame]

342

HadError = true;

Steve Naroff

2009-03-30 23:46:03 +0000

[diff] [blame]

343

return;

344

}

Nico Weber

2010-10-09 00:27:47 +0000

[diff] [blame]

345

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

346

assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) &&

347

"only character widths of 1, 2, or 4 bytes supported");

Nico Weber

2010-10-06 04:57:26 +0000

[diff] [blame]

348

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

349

(void)UcnLen;

350

assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");

Nico Weber

2010-10-06 04:57:26 +0000

[diff] [blame]

351

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

352

if (CharByteWidth == 4) {

Eli Friedman

2011-11-02 23:06:23 +0000

[diff] [blame]

353

// FIXME: Make the type of the result buffer correct instead of

354

// using reinterpret_cast.

355

UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);

356

*ResultPtr = UcnVal;

357

ResultBuf += 4;

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

return;

}

if (CharByteWidth == 2) {

Eli Friedman

2011-11-02 23:06:23 +0000

[diff] [blame]

362

// FIXME: Make the type of the result buffer correct instead of

363

// using reinterpret_cast.

364

UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);

365

Richard Smith

59b26d8

2012-06-13 05:41:29 +0000

[diff] [blame]

366

if (UcnVal <= (UTF32)0xFFFF) {

Eli Friedman

2011-11-02 23:06:23 +0000

[diff] [blame]

367

*ResultPtr = UcnVal;

368

ResultBuf += 2;

Nico Weber

2010-10-06 04:57:26 +0000

[diff] [blame]

369

return;

370

}

Nico Weber

2010-10-06 04:57:26 +0000

[diff] [blame]

371

Eli Friedman

2011-11-02 23:06:23 +0000

[diff] [blame]

372

// Convert to UTF16.

Nico Weber

2010-10-06 04:57:26 +0000

[diff] [blame]

373

UcnVal -= 0x10000;

Eli Friedman

2011-11-02 23:06:23 +0000

[diff] [blame]

374

*ResultPtr = 0xD800 + (UcnVal >> 10);

375

*(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);

376

ResultBuf += 4;

Fariborz Jahanian

56bedef

2010-08-31 23:34:27 +0000

[diff] [blame]

377

return;

378

}

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

379

380

assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");

381

Steve Naroff

2009-03-30 23:46:03 +0000

[diff] [blame]

382

// Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.

383

// The conversion below was inspired by:

384

// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

385

// First, we determine how many bytes the result will require.

Steve Naroff

4e93b34

2009-04-01 11:09:15 +0000

[diff] [blame]

386

typedef uint8_t UTF8;

Steve Naroff

2009-03-30 23:46:03 +0000

[diff] [blame]

387

388

unsigned short bytesToWrite = 0;

389

if (UcnVal < (UTF32)0x80)

390

bytesToWrite = 1;

391

else if (UcnVal < (UTF32)0x800)

392

bytesToWrite = 2;

393

else if (UcnVal < (UTF32)0x10000)

394

bytesToWrite = 3;

395

else

396

bytesToWrite = 4;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

397

Steve Naroff

2009-03-30 23:46:03 +0000

[diff] [blame]

398

const unsigned byteMask = 0xBF;

399

const unsigned byteMark = 0x80;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

400

Steve Naroff

2009-03-30 23:46:03 +0000

[diff] [blame]

401

// Once the bits are split out into bytes of UTF8, this is a mask OR-ed

Steve Naroff

8a5c0cd

2009-03-31 10:29:45 +0000

[diff] [blame]

402

// into the first byte, depending on how many bytes follow.

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

403

static const UTF8 firstByteMark[5] = {

Steve Naroff

8a5c0cd

2009-03-31 10:29:45 +0000

[diff] [blame]

404

0x00, 0x00, 0xC0, 0xE0, 0xF0

Steve Naroff

2009-03-30 23:46:03 +0000

[diff] [blame]

405

};

406

// Finally, we write the bytes into ResultBuf.

407

ResultBuf += bytesToWrite;

408

switch (bytesToWrite) { // note: everything falls through.

409

case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;

410

case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;

411

case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;

412

case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);

413

}

414

// Update the buffer.

415

ResultBuf += bytesToWrite;

416

}

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

417

418

419

/// integer-constant: [C99 6.4.4.1]

420

/// decimal-constant integer-suffix

421

/// octal-constant integer-suffix

422

/// hexadecimal-constant integer-suffix

Richard Smith

49d5174

2012-03-08 21:59:28 +0000

[diff] [blame]

423

/// user-defined-integer-literal: [C++11 lex.ext]

Richard Smith

2012-03-08 08:45:32 +0000

[diff] [blame]

424

/// decimal-literal ud-suffix

425

/// octal-literal ud-suffix

426

/// hexadecimal-literal ud-suffix

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

427

/// decimal-constant:

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

428

/// nonzero-digit

429

/// decimal-constant digit

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

430

/// octal-constant:

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

431

/// 0

432

/// octal-constant octal-digit

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

433

/// hexadecimal-constant:

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

434

/// hexadecimal-prefix hexadecimal-digit

435

/// hexadecimal-constant hexadecimal-digit

436

/// hexadecimal-prefix: one of

437

/// 0x 0X

438

/// integer-suffix:

439

/// unsigned-suffix [long-suffix]

440

/// unsigned-suffix [long-long-suffix]

441

/// long-suffix [unsigned-suffix]

442

/// long-long-suffix [unsigned-suffix]

443

/// nonzero-digit:

444

/// 1 2 3 4 5 6 7 8 9

445

/// octal-digit:

446

/// 0 1 2 3 4 5 6 7

447

/// hexadecimal-digit:

448

/// 0 1 2 3 4 5 6 7 8 9

449

/// a b c d e f

450

/// A B C D E F

451

/// unsigned-suffix: one of

452

/// u U

453

/// long-suffix: one of

454

/// l L

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

455

/// long-long-suffix: one of

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

456

/// ll LL

457

///

458

/// floating-constant: [C99 6.4.4.2]

459

/// TODO: add rules...

460

///

Dmitri Gribenko

2012-09-24 09:53:54 +0000

[diff] [blame]

461

NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,

462

SourceLocation TokLoc,

463

Preprocessor &PP)

464

: PP(PP), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

465

Chris Lattner

c29bbde

2008-09-30 20:45:40 +0000

[diff] [blame]

466

// This routine assumes that the range begin/end matches the regex for integer

467

// and FP constants (specifically, the 'pp-number' regex), and assumes that

468

// the byte at "*end" is both valid and not part of the regex. Because of

469

// this, it doesn't have to check for 'overscan' in various places.

Dmitri Gribenko

2012-09-24 09:53:54 +0000

[diff] [blame]

470

assert(!isalnum(*ThisTokEnd) && *ThisTokEnd != '.' && *ThisTokEnd != '_' &&

Chris Lattner

c29bbde

2008-09-30 20:45:40 +0000

[diff] [blame]

471

"Lexer didn't maximally munch?");

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

472

Dmitri Gribenko

2012-09-24 09:53:54 +0000

[diff] [blame]

473

s = DigitsBegin = ThisTokBegin;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

474

saw_exponent = false;

475

saw_period = false;

Richard Smith

2012-03-08 08:45:32 +0000

[diff] [blame]

476

saw_ud_suffix = false;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

477

isLong = false;

478

isUnsigned = false;

479

isLongLong = false;

Chris Lattner

6e400c2

2007-08-26 03:29:23 +0000

[diff] [blame]

480

isFloat = false;

Chris Lattner

2007-08-26 01:58:14 +0000

[diff] [blame]

481

isImaginary = false;

Mike Stump

2009-10-08 22:55:36 +0000

[diff] [blame]

482

isMicrosoftInteger = false;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

483

hadError = false;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

484

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

485

if (*s == '0') { // parse radix

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

486

ParseNumberStartingWithZero(TokLoc);

487

if (hadError)

488

return;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

489

} else { // the first digit is non-zero

490

radix = 10;

491

s = SkipDigits(s);

492

if (s == ThisTokEnd) {

493

// Done.

Christopher Lamb

016765e

2007-11-29 06:06:27 +0000

[diff] [blame]

494

} else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {

Dmitri Gribenko

2012-09-24 09:53:54 +0000

[diff] [blame]

495

PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),

Chris Lattner

2011-07-23 10:55:15 +0000

[diff] [blame]

496

diag::err_invalid_decimal_digit) << StringRef(s, 1);

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

497

hadError = true;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

498

return;

499

} else if (*s == '.') {

500

s++;

501

saw_period = true;

502

s = SkipDigits(s);

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

503

}

Chris Lattner

4411f46

2008-09-29 23:12:31 +0000

[diff] [blame]

504

if ((*s == 'e' || *s == 'E')) { // exponent

Chris Lattner

70f66ab

2008-04-20 18:47:55 +0000

[diff] [blame]

505

const char *Exponent = s;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

506

s++;

507

saw_exponent = true;

508

if (*s == '+' || *s == '-') s++; // sign

509

const char *first_non_digit = SkipDigits(s);

Chris Lattner

0b7f69d

2008-04-20 18:41:46 +0000

[diff] [blame]

510

if (first_non_digit != s) {

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

511

s = first_non_digit;

Chris Lattner

0b7f69d

2008-04-20 18:41:46 +0000

[diff] [blame]

512

} else {

Dmitri Gribenko

2012-09-24 09:53:54 +0000

[diff] [blame]

513

PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent - ThisTokBegin),

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

514

diag::err_exponent_has_no_digits);

515

hadError = true;

Chris Lattner

0b7f69d

2008-04-20 18:41:46 +0000

[diff] [blame]

516

return;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

}

}

}

SuffixBegin = s;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

522

Chris Lattner

2007-08-26 01:58:14 +0000

[diff] [blame]

523

// Parse the suffix. At this point we can classify whether we have an FP or

524

// integer constant.

525

bool isFPConstant = isFloatingLiteral();

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

526

Chris Lattner

2007-08-26 01:58:14 +0000

[diff] [blame]

527

// Loop over all of the characters of the suffix. If we see something bad,

528

// we break out of the loop.

529

for (; s != ThisTokEnd; ++s) {

530

switch (*s) {

531

case 'f': // FP Suffix for "float"

532

case 'F':

533

if (!isFPConstant) break; // Error for integer constant.

Chris Lattner

6e400c2

2007-08-26 03:29:23 +0000

[diff] [blame]

534

if (isFloat || isLong) break; // FF, LF invalid.

535

isFloat = true;

Chris Lattner

2007-08-26 01:58:14 +0000

[diff] [blame]

536

continue; // Success.

537

case 'u':

538

case 'U':

539

if (isFPConstant) break; // Error for floating constant.

540

if (isUnsigned) break; // Cannot be repeated.

541

isUnsigned = true;

542

continue; // Success.

543

case 'l':

544

case 'L':

545

if (isLong || isLongLong) break; // Cannot be repeated.

Chris Lattner

6e400c2

2007-08-26 03:29:23 +0000

[diff] [blame]

546

if (isFloat) break; // LF invalid.

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

547

Chris Lattner

2007-08-26 01:58:14 +0000

[diff] [blame]

548

// Check for long long. The L's need to be adjacent and the same case.

549

if (s+1 != ThisTokEnd && s[1] == s[0]) {

550

if (isFPConstant) break; // long long invalid for floats.

551

isLongLong = true;

552

++s; // Eat both of them.

553

} else {

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

554

isLong = true;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

555

}

Chris Lattner

2007-08-26 01:58:14 +0000

[diff] [blame]

556

continue; // Success.

557

case 'i':

Chris Lattner

c637415

2010-10-14 00:24:10 +0000

[diff] [blame]

558

case 'I':

David Blaikie

2012-03-11 07:00:24 +0000

[diff] [blame]

559

if (PP.getLangOpts().MicrosoftExt) {

Fariborz Jahanian

a8be02b

2010-01-22 21:36:53 +0000

[diff] [blame]

560

if (isFPConstant || isLong || isLongLong) break;

Nuno Lopes

2009-11-28 13:37:52 +0000

[diff] [blame]

561

Steve Naroff

0c29b22

2008-04-04 21:02:54 +0000

[diff] [blame]

562

// Allow i8, i16, i32, i64, and i128.

Mike Stump

2009-10-08 22:55:36 +0000

[diff] [blame]

563

if (s + 1 != ThisTokEnd) {

switch (s[1]) {

case '8':

s += 2; // i8 suffix

isMicrosoftInteger = true;

Nuno Lopes

2009-11-28 13:37:52 +0000

[diff] [blame]

568

break;

Mike Stump

2009-10-08 22:55:36 +0000

[diff] [blame]

569

case '1':

Nuno Lopes

2009-11-28 13:37:52 +0000

[diff] [blame]

570

if (s + 2 == ThisTokEnd) break;

Francois Pichet

2011-01-11 11:57:53 +0000

[diff] [blame]

571

if (s[2] == '6') {

572

s += 3; // i16 suffix

573

isMicrosoftInteger = true;

574

}

Nuno Lopes

2009-11-28 13:37:52 +0000

[diff] [blame]

575

else if (s[2] == '2') {

576

if (s + 3 == ThisTokEnd) break;

Francois Pichet

2011-01-11 11:57:53 +0000

[diff] [blame]

577

if (s[3] == '8') {

578

s += 4; // i128 suffix

579

isMicrosoftInteger = true;

580

}

Mike Stump

2009-10-08 22:55:36 +0000

[diff] [blame]

581

}

Nuno Lopes

2009-11-28 13:37:52 +0000

[diff] [blame]

582

break;

Mike Stump

2009-10-08 22:55:36 +0000

[diff] [blame]

583

case '3':

Nuno Lopes

2009-11-28 13:37:52 +0000

[diff] [blame]

584

if (s + 2 == ThisTokEnd) break;

Francois Pichet

2011-01-11 11:57:53 +0000

[diff] [blame]

585

if (s[2] == '2') {

586

s += 3; // i32 suffix

587

isLong = true;

588

isMicrosoftInteger = true;

589

}

Nuno Lopes

2009-11-28 13:37:52 +0000

[diff] [blame]

590

break;

Mike Stump

2009-10-08 22:55:36 +0000

[diff] [blame]

591

case '6':

Nuno Lopes

2009-11-28 13:37:52 +0000

[diff] [blame]

592

if (s + 2 == ThisTokEnd) break;

Francois Pichet

2011-01-11 11:57:53 +0000

[diff] [blame]

593

if (s[2] == '4') {

594

s += 3; // i64 suffix

595

isLongLong = true;

596

isMicrosoftInteger = true;

597

}

Nuno Lopes

2009-11-28 13:37:52 +0000

[diff] [blame]

598

break;

Mike Stump

2009-10-08 22:55:36 +0000

[diff] [blame]

default:

break;

}

break;

Steve Naroff

0c29b22

2008-04-04 21:02:54 +0000

[diff] [blame]

603

}

Steve Naroff

0c29b22

2008-04-04 21:02:54 +0000

[diff] [blame]

604

}

605

// fall through.

Chris Lattner

2007-08-26 01:58:14 +0000

[diff] [blame]

606

case 'j':

607

case 'J':

608

if (isImaginary) break; // Cannot be repeated.

Dmitri Gribenko

2012-09-24 09:53:54 +0000

[diff] [blame]

609

PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),

Chris Lattner

2007-08-26 01:58:14 +0000

[diff] [blame]

610

diag::ext_imaginary_constant);

611

isImaginary = true;

612

continue; // Success.

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

613

}

Richard Smith

2012-03-08 08:45:32 +0000

[diff] [blame]

614

// If we reached here, there was an error or a ud-suffix.

Chris Lattner

2007-08-26 01:58:14 +0000

[diff] [blame]

615

break;

616

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

617

Chris Lattner

2007-08-26 01:58:14 +0000

[diff] [blame]

618

if (s != ThisTokEnd) {

David Blaikie

2012-03-11 07:00:24 +0000

[diff] [blame]

619

if (PP.getLangOpts().CPlusPlus0x && s == SuffixBegin && *s == '_') {

Richard Smith

2012-03-08 08:45:32 +0000

[diff] [blame]

620

// We have a ud-suffix! By C++11 [lex.ext]p10, ud-suffixes not starting

621

// with an '_' are ill-formed.

622

saw_ud_suffix = true;

return;

}

// Report an error if there are any.

Dmitri Gribenko

2012-09-24 09:53:54 +0000

[diff] [blame]

627

PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin),

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

628

isFPConstant ? diag::err_invalid_suffix_float_constant :

629

diag::err_invalid_suffix_integer_constant)

Chris Lattner

2011-07-23 10:55:15 +0000

[diff] [blame]

630

<< StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

631

hadError = true;

Chris Lattner

2007-08-26 01:58:14 +0000

[diff] [blame]

632

return;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

}

}

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

636

/// ParseNumberStartingWithZero - This method is called when the first character

637

/// of the number is found to be a zero. This means it is either an octal

638

/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

639

/// a floating point number (01239.123e4). Eat the prefix, determining the

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

640

/// radix etc.

641

void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {

642

assert(s[0] == '0' && "Invalid method call");

643

s++;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

644

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

645

// Handle a hex number like 0x1234.

646

if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {

s++;

radix = 16;

DigitsBegin = s;

s = SkipHexDigits(s);

Aaron Ballman

2012-02-08 13:36:33 +0000

[diff] [blame]

651

bool noSignificand = (s == DigitsBegin);

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

652

if (s == ThisTokEnd) {

653

// Done.

654

} else if (*s == '.') {

655

s++;

656

saw_period = true;

Aaron Ballman

2012-02-08 13:36:33 +0000

[diff] [blame]

657

const char *floatDigitsBegin = s;

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

658

s = SkipHexDigits(s);

Aaron Ballman

2012-02-08 13:36:33 +0000

[diff] [blame]

659

noSignificand &= (floatDigitsBegin == s);

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

660

}

Aaron Ballman

2012-02-08 13:36:33 +0000

[diff] [blame]

661

662

if (noSignificand) {

Dmitri Gribenko

2012-09-24 09:53:54 +0000

[diff] [blame]

663

PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),

Aaron Ballman

2012-02-08 13:36:33 +0000

[diff] [blame]

664

diag::err_hexconstant_requires_digits);

hadError = true;

return;

}

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

669

// A binary exponent can appear with or with a '.'. If dotted, the

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

670

// binary exponent is required.

Douglas Gregor

1155c42

2011-08-30 22:40:35 +0000

[diff] [blame]

671

if (*s == 'p' || *s == 'P') {

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

672

const char *Exponent = s;

673

s++;

674

saw_exponent = true;

675

if (*s == '+' || *s == '-') s++; // sign

676

const char *first_non_digit = SkipDigits(s);

Chris Lattner

6ea6238

2008-07-25 18:18:34 +0000

[diff] [blame]

677

if (first_non_digit == s) {

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

678

PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),

679

diag::err_exponent_has_no_digits);

680

hadError = true;

Chris Lattner

6ea6238

2008-07-25 18:18:34 +0000

[diff] [blame]

681

return;

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

682

}

Chris Lattner

6ea6238

2008-07-25 18:18:34 +0000

[diff] [blame]

683

s = first_non_digit;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

684

David Blaikie

2012-03-11 07:00:24 +0000

[diff] [blame]

685

if (!PP.getLangOpts().HexFloats)

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

686

PP.Diag(TokLoc, diag::ext_hexconstant_invalid);

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

687

} else if (saw_period) {

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

688

PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),

689

diag::err_hexconstant_requires_exponent);

690

hadError = true;

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

691

}

692

return;

693

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

694

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

695

// Handle simple binary numbers 0b01010

696

if (*s == 'b' || *s == 'B') {

697

// 0b101010 is a GCC extension.

Chris Lattner

2008-06-30 06:44:49 +0000

[diff] [blame]

698

PP.Diag(TokLoc, diag::ext_binary_literal);

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

++s;

radix = 2;

DigitsBegin = s;

s = SkipBinaryDigits(s);

703

if (s == ThisTokEnd) {

704

// Done.

705

} else if (isxdigit(*s)) {

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

706

PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),

Chris Lattner

2011-07-23 10:55:15 +0000

[diff] [blame]

707

diag::err_invalid_binary_digit) << StringRef(s, 1);

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

708

hadError = true;

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

709

}

Chris Lattner

2008-06-30 06:44:49 +0000

[diff] [blame]

710

// Other suffixes will be diagnosed by the caller.

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

711

return;

712

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

713

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

714

// For now, the radix is set to 8. If we discover that we have a

715

// floating point constant, the radix will change to 10. Octal floating

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

716

// point constants are not permitted (only decimal and hexadecimal).

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

717

radix = 8;

718

DigitsBegin = s;

719

s = SkipOctalDigits(s);

720

if (s == ThisTokEnd)

721

return; // Done, simple octal number like 01234

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

722

Chris Lattner

2008-06-30 06:44:49 +0000

[diff] [blame]

723

// If we have some other non-octal digit that *is* a decimal digit, see if

724

// this is part of a floating point number like 094.123 or 09e1.

725

if (isdigit(*s)) {

726

const char *EndDecimal = SkipDigits(s);

727

if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {

s = EndDecimal;

radix = 10;

}

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

732

Chris Lattner

2008-06-30 06:44:49 +0000

[diff] [blame]

733

// If we have a hex digit other than 'e' (which denotes a FP exponent) then

734

// the code is using an incorrect base.

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

735

if (isxdigit(*s) && *s != 'e' && *s != 'E') {

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

736

PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),

Chris Lattner

2011-07-23 10:55:15 +0000

[diff] [blame]

737

diag::err_invalid_octal_digit) << StringRef(s, 1);

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

738

hadError = true;

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

739

return;

740

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

741

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

if (*s == '.') {

s++;

radix = 10;

saw_period = true;

Chris Lattner

2008-06-30 06:44:49 +0000

[diff] [blame]

746

s = SkipDigits(s); // Skip suffix.

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

747

}

748

if (*s == 'e' || *s == 'E') { // exponent

749

const char *Exponent = s;

s++;

radix = 10;

saw_exponent = true;

if (*s == '+' || *s == '-') s++; // sign

754

const char *first_non_digit = SkipDigits(s);

755

if (first_non_digit != s) {

756

s = first_non_digit;

757

} else {

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

758

PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),

Chris Lattner

2008-11-22 07:23:31 +0000

[diff] [blame]

759

diag::err_exponent_has_no_digits);

760

hadError = true;

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

return;

}

}

}

Jordan Rose

2012-09-25 22:32:51 +0000

[diff] [blame]

766

static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {

Dmitri Gribenko

2012-09-25 19:09:15 +0000

[diff] [blame]

767

switch (Radix) {

768

case 2:

769

return NumDigits <= 64;

770

case 8:

771

return NumDigits <= 64 / 3; // Digits are groups of 3 bits.

772

case 10:

773

return NumDigits <= 19; // floor(log10(2^64))

774

case 16:

775

return NumDigits <= 64 / 4; // Digits are groups of 4 bits.

776

default:

777

llvm_unreachable("impossible Radix");

778

}

779

}

Chris Lattner

2008-06-30 06:39:54 +0000

[diff] [blame]

780

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

781

/// GetIntegerValue - Convert this numeric literal value to an APInt that

782

/// matches Val's input width. If there is an overflow, set Val to the low bits

783

/// of the result and return true. Otherwise, return false.

784

bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {

Daniel Dunbar

a179be3

2008-10-16 07:32:01 +0000

[diff] [blame]

785

// Fast path: Compute a conservative bound on the maximum number of

786

// bits per digit in this radix. If we can't possibly overflow a

787

// uint64 based on that bound then do the simple conversion to

788

// integer. This avoids the expensive overflow checking below, and

789

// handles the common cases that matter (small decimal integers and

790

// hex/octal values which don't overflow).

Dmitri Gribenko

2012-09-25 19:09:15 +0000

[diff] [blame]

791

const unsigned NumDigits = SuffixBegin - DigitsBegin;

Jordan Rose

2fd6956

2012-09-25 22:32:51 +0000

[diff] [blame]

792

if (alwaysFitsInto64Bits(radix, NumDigits)) {

Daniel Dunbar

a179be3

2008-10-16 07:32:01 +0000

[diff] [blame]

793

uint64_t N = 0;

Dmitri Gribenko

2012-09-25 19:09:15 +0000

[diff] [blame]

794

for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)

795

N = N * radix + HexDigitValue(*Ptr);

Daniel Dunbar

a179be3

2008-10-16 07:32:01 +0000

[diff] [blame]

796

797

// This will truncate the value to Val's input width. Simply check

798

// for overflow by comparing.

799

Val = N;

800

return Val.getZExtValue() != N;

801

}

802

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

803

Val = 0;

Dmitri Gribenko

2012-09-25 19:09:15 +0000

[diff] [blame]

804

const char *Ptr = DigitsBegin;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

805

806

llvm::APInt RadixVal(Val.getBitWidth(), radix);

807

llvm::APInt CharVal(Val.getBitWidth(), 0);

808

llvm::APInt OldVal = Val;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

809

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

810

bool OverflowOccurred = false;

Dmitri Gribenko

2012-09-25 19:09:15 +0000

[diff] [blame]

811

while (Ptr < SuffixBegin) {

812

unsigned C = HexDigitValue(*Ptr++);

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

813

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

814

// If this letter is out of bound for this radix, reject it.

815

assert(C < radix && "NumericLiteralParser ctor should have rejected this");

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

816

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

817

CharVal = C;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

818

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

819

// Add the digit to the value in the appropriate radix. If adding in digits

820

// made the value smaller, then this overflowed.

821

OldVal = Val;

822

823

// Multiply by radix, did overflow occur on the multiply?

824

Val *= RadixVal;

825

OverflowOccurred |= Val.udiv(RadixVal) != OldVal;

826

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

827

// Add value, did overflow occur on the value?

Daniel Dunbar

d70cb64

2008-10-16 06:39:30 +0000

[diff] [blame]

828

// (a + b) ult b <=> overflow

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

829

Val += CharVal;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

830

OverflowOccurred |= Val.ult(CharVal);

831

}

832

return OverflowOccurred;

833

}

834

John McCall

94c939d

2009-12-24 09:08:04 +0000

[diff] [blame]

835

/// GetFloatValue - Convert the literal's text, without its suffix, into
/// Result using round-to-nearest-even, and return the status reported by
/// APFloat::convertFromString.
llvm::APFloat::opStatus
NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
  // The text handed to APFloat starts at the beginning of the token and stops
  // at the suffix, or at the end of the token if that comes first.
  const char *TextEnd = SuffixBegin < ThisTokEnd ? SuffixBegin : ThisTokEnd;
  unsigned Length = TextEnd - ThisTokBegin;

  return Result.convertFromString(StringRef(ThisTokBegin, Length),
                                  llvm::APFloat::rmNearestTiesToEven);
}

843

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

844

James Dennett

58f9ce1

2012-06-17 03:34:42 +0000

[diff] [blame]

845

/// \verbatim

Richard Smith

2012-03-05 04:02:15 +0000

[diff] [blame]

846

/// user-defined-character-literal: [C++11 lex.ext]

847

/// character-literal ud-suffix

848

/// ud-suffix:

849

/// identifier

850

/// character-literal: [C++11 lex.ccon]

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

851

/// ' c-char-sequence '

852

/// u' c-char-sequence '

853

/// U' c-char-sequence '

854

/// L' c-char-sequence '

855

/// c-char-sequence:

856

/// c-char

857

/// c-char-sequence c-char

858

/// c-char:

859

/// any member of the source character set except the single-quote ',

860

/// backslash \, or new-line character

861

/// escape-sequence

862

/// universal-character-name

Richard Smith

2012-03-05 04:02:15 +0000

[diff] [blame]

863

/// escape-sequence:

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

864

/// simple-escape-sequence

865

/// octal-escape-sequence

866

/// hexadecimal-escape-sequence

867

/// simple-escape-sequence:

NAKAMURA Takumi

ddddd48

2011-08-12 05:49:51 +0000

[diff] [blame]

868

/// one of \' \" \? \\ \a \b \f \n \r \t \v

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

869

/// octal-escape-sequence:

870

/// \ octal-digit

871

/// \ octal-digit octal-digit

872

/// \ octal-digit octal-digit octal-digit

873

/// hexadecimal-escape-sequence:

874

/// \x hexadecimal-digit

875

/// hexadecimal-escape-sequence hexadecimal-digit

Richard Smith

2012-03-05 04:02:15 +0000

[diff] [blame]

876

/// universal-character-name: [C++11 lex.charset]

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

877

/// \u hex-quad

878

/// \U hex-quad hex-quad

879

/// hex-quad:

880

/// hex-digit hex-digit hex-digit hex-digit

James Dennett

58f9ce1

2012-06-17 03:34:42 +0000

[diff] [blame]

881

/// \endverbatim

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

882

///

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

883

CharLiteralParser::CharLiteralParser(const char *begin, const char *end,

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

884

SourceLocation Loc, Preprocessor &PP,

885

tok::TokenKind kind) {

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

886

// At this point we know that the character matches the regex "(L|u|U)?'.*'".

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

887

HadError = false;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

888

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

889

Kind = kind;

890

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

891

const char *TokBegin = begin;

892

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

893

// Skip over wide character determinant.

894

if (Kind != tok::char_constant) {

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

895

++begin;

896

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

897

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

898

// Skip over the entry quote.

899

assert(begin[0] == '\'' && "Invalid token lexed");

900

++begin;

901

Richard Smith

2012-03-05 04:02:15 +0000

[diff] [blame]

902

// Remove an optional ud-suffix.

903

if (end[-1] != '\'') {

904

const char *UDSuffixEnd = end;

905

do {

906

--end;

907

} while (end[-1] != '\'');

908

UDSuffixBuf.assign(end, UDSuffixEnd);

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

909

UDSuffixOffset = end - TokBegin;

Richard Smith

2012-03-05 04:02:15 +0000

[diff] [blame]

910

}

911

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

912

// Trim the ending quote.

Richard Smith

2012-03-05 04:02:15 +0000

[diff] [blame]

913

assert(end != begin && "Invalid token lexed");

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

914

--end;

915

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

916

// FIXME: The "Value" is an uint64_t so we can handle char literals of

Chris Lattner

fc8f0e1

2011-04-15 05:22:18 +0000

[diff] [blame]

917

// up to 64-bits.

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

918

// FIXME: This extensively assumes that 'char' is 8-bits.

Chris Lattner

98be494

2008-03-05 18:54:05 +0000

[diff] [blame]

919

assert(PP.getTargetInfo().getCharWidth() == 8 &&

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

920

"Assumes char is 8 bits");

Chris Lattner

2009-04-28 21:51:46 +0000

[diff] [blame]

921

assert(PP.getTargetInfo().getIntWidth() <= 64 &&

922

(PP.getTargetInfo().getIntWidth() & 7) == 0 &&

923

"Assumes sizeof(int) on target is <= 64 and a multiple of char");

924

assert(PP.getTargetInfo().getWCharWidth() <= 64 &&

925

"Assumes sizeof(wchar) on target is <= 64");

Sanjiv Gupta

4bc11af

2009-04-21 02:21:29 +0000

[diff] [blame]

926

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

927

SmallVector<uint32_t,4> codepoint_buffer;

928

codepoint_buffer.resize(end-begin);

929

uint32_t *buffer_begin = &codepoint_buffer.front();

930

uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

931

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

932

// Unicode escapes representing characters that cannot be correctly

933

// represented in a single code unit are disallowed in character literals

934

// by this implementation.

935

uint32_t largest_character_for_kind;

936

if (tok::wide_char_constant == Kind) {

937

largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());

938

} else if (tok::utf16_char_constant == Kind) {

939

largest_character_for_kind = 0xFFFF;

940

} else if (tok::utf32_char_constant == Kind) {

941

largest_character_for_kind = 0x10FFFF;

942

} else {

943

largest_character_for_kind = 0x7Fu;

Chris Lattner

2009-04-28 21:51:46 +0000

[diff] [blame]

944

}

945

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

946

while (begin!=end) {

947

// Is this a span of non-escape characters?

948

if (begin[0] != '\\') {

949

char const *start = begin;

950

do {

951

++begin;

952

} while (begin != end && *begin != '\\');

953

Eli Friedman

2012-02-11 05:08:10 +0000

[diff] [blame]

954

char const *tmp_in_start = start;

955

uint32_t *tmp_out_start = buffer_begin;

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

956

ConversionResult res =

957

ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),

958

reinterpret_cast<UTF8 const *>(begin),

959

&buffer_begin,buffer_end,strictConversion);

960

if (res!=conversionOK) {

Eli Friedman

2012-02-11 05:08:10 +0000

[diff] [blame]

961

// If we see bad encoding for unprefixed character literals, warn and

962

// simply copy the byte values, for compatibility with gcc and

963

// older versions of clang.

964

bool NoErrorOnBadEncoding = isAscii();

965

unsigned Msg = diag::err_bad_character_encoding;

966

if (NoErrorOnBadEncoding)

967

Msg = diag::warn_bad_character_encoding;

968

PP.Diag(Loc, Msg);

969

if (NoErrorOnBadEncoding) {

970

start = tmp_in_start;

971

buffer_begin = tmp_out_start;

972

for ( ; start != begin; ++start, ++buffer_begin)

973

*buffer_begin = static_cast<uint8_t>(*start);

974

} else {

975

HadError = true;

976

}

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

977

} else {

Eli Friedman

2012-02-11 05:08:10 +0000

[diff] [blame]

978

for (; tmp_out_start <buffer_begin; ++tmp_out_start) {

979

if (*tmp_out_start > largest_character_for_kind) {

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

980

HadError = true;

981

PP.Diag(Loc, diag::err_character_too_large);

}

}

}

continue;

}

// Is this a Universal Character Name excape?

989

if (begin[1] == 'u' || begin[1] == 'U') {

990

unsigned short UcnLen = 0;

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

991

if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

992

FullSourceLoc(Loc, PP.getSourceManager()),

David Blaikie

2012-03-11 07:00:24 +0000

[diff] [blame]

993

&PP.getDiagnostics(), PP.getLangOpts(),

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

true))

{

HadError = true;

} else if (*buffer_begin > largest_character_for_kind) {

998

HadError = true;

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

999

PP.Diag(Loc, diag::err_character_too_large);

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

}

++buffer_begin;

continue;

}

unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());

1006

uint64_t result =

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1007

ProcessCharEscape(TokBegin, begin, end, HadError,

1008

FullSourceLoc(Loc,PP.getSourceManager()),

1009

CharWidth, &PP.getDiagnostics(), PP.getLangOpts());

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

1010

*buffer_begin++ = result;

1011

}

1012

1013

unsigned NumCharsSoFar = buffer_begin-&codepoint_buffer.front();

1014

Chris Lattner

2009-04-28 21:51:46 +0000

[diff] [blame]

1015

if (NumCharsSoFar > 1) {

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

1016

if (isWide())

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

1017

PP.Diag(Loc, diag::warn_extraneous_char_constant);

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

1018

else if (isAscii() && NumCharsSoFar == 4)

1019

PP.Diag(Loc, diag::ext_four_char_character_literal);

1020

else if (isAscii())

Chris Lattner

2009-04-28 21:51:46 +0000

[diff] [blame]

1021

PP.Diag(Loc, diag::ext_multichar_character_literal);

1022

else

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

1023

PP.Diag(Loc, diag::err_multichar_utf_character_literal);

Eli Friedman

2a1c363

2009-06-01 05:25:02 +0000

[diff] [blame]

1024

IsMultiChar = true;

Daniel Dunbar

930b71a

2009-07-29 01:46:05 +0000

[diff] [blame]

1025

} else

1026

IsMultiChar = false;

Sanjiv Gupta

4bc11af

2009-04-21 02:21:29 +0000

[diff] [blame]

1027

Seth Cantrell

2012-01-18 12:27:04 +0000

[diff] [blame]

1028

llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);

1029

1030

// Narrow character literals act as though their value is concatenated

1031

// in this implementation, but warn on overflow.

1032

bool multi_char_too_long = false;

1033

if (isAscii() && isMultiChar()) {

1034

LitVal = 0;

1035

for (size_t i=0;i<NumCharsSoFar;++i) {

1036

// check for enough leading zeros to shift into

1037

multi_char_too_long |= (LitVal.countLeadingZeros() < 8);

1038

LitVal <<= 8;

1039

LitVal = LitVal + (codepoint_buffer[i] & 0xFF);

1040

}

1041

} else if (NumCharsSoFar > 0) {

1042

// otherwise just take the last character

1043

LitVal = buffer_begin[-1];

1044

}

1045

1046

if (!HadError && multi_char_too_long) {

1047

PP.Diag(Loc,diag::warn_char_constant_too_large);

1048

}

1049

Sanjiv Gupta

4bc11af

2009-04-21 02:21:29 +0000

[diff] [blame]

1050

// Transfer the value from APInt to uint64_t

1051

Value = LitVal.getZExtValue();

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1052

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1053

// If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")

1054

// if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple

1055

// character constants are not sign extended in the this implementation:

1056

// '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

1057

if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&

David Blaikie

2012-03-11 07:00:24 +0000

[diff] [blame]

1058

PP.getLangOpts().CharIsSigned)

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1059

Value = (signed char)Value;

1060

}

1061

James Dennett

a1263cf

2012-06-19 21:04:25 +0000

[diff] [blame]

1062

/// \verbatim

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1063

/// string-literal: [C++0x lex.string]

1064

/// encoding-prefix " [s-char-sequence] "

1065

/// encoding-prefix R raw-string

/// encoding-prefix:

/// u8

/// u

/// U

/// L

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1071

/// s-char-sequence:

1072

/// s-char

1073

/// s-char-sequence s-char

1074

/// s-char:

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1075

/// any member of the source character set except the double-quote ",

1076

/// backslash \, or new-line character

1077

/// escape-sequence

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1078

/// universal-character-name

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1079

/// raw-string:

1080

/// " d-char-sequence ( r-char-sequence ) d-char-sequence "

1081

/// r-char-sequence:

1082

/// r-char

1083

/// r-char-sequence r-char

1084

/// r-char:

1085

/// any member of the source character set, except a right parenthesis )

1086

/// followed by the initial d-char-sequence (which may be empty)

1087

/// followed by a double quote ".

1088

/// d-char-sequence:

1089

/// d-char

1090

/// d-char-sequence d-char

1091

/// d-char:

1092

/// any member of the basic source character set except:

1093

/// space, the left parenthesis (, the right parenthesis ),

1094

/// the backslash \, and the control characters representing horizontal

1095

/// tab, vertical tab, form feed, and newline.

1096

/// escape-sequence: [C++0x lex.ccon]

1097

/// simple-escape-sequence

1098

/// octal-escape-sequence

1099

/// hexadecimal-escape-sequence

1100

/// simple-escape-sequence:

NAKAMURA Takumi

ddddd48

2011-08-12 05:49:51 +0000

[diff] [blame]

1101

/// one of \' \" \? \\ \a \b \f \n \r \t \v

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1102

/// octal-escape-sequence:

1103

/// \ octal-digit

1104

/// \ octal-digit octal-digit

1105

/// \ octal-digit octal-digit octal-digit

1106

/// hexadecimal-escape-sequence:

1107

/// \x hexadecimal-digit

1108

/// hexadecimal-escape-sequence hexadecimal-digit

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1109

/// universal-character-name:

1110

/// \u hex-quad

1111

/// \U hex-quad hex-quad

1112

/// hex-quad:

1113

/// hex-digit hex-digit hex-digit hex-digit

James Dennett

a1263cf

2012-06-19 21:04:25 +0000

[diff] [blame]

1114

/// \endverbatim

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1115

///

1116

StringLiteralParser::

Chris Lattner

d217773

2007-07-20 16:59:19 +0000

[diff] [blame]

1117

StringLiteralParser(const Token *StringToks, unsigned NumStringToks,

Chris Lattner

2010-11-17 07:21:13 +0000

[diff] [blame]

1118

Preprocessor &PP, bool Complain)

David Blaikie

2012-03-11 07:00:24 +0000

[diff] [blame]

1119

: SM(PP.getSourceManager()), Features(PP.getLangOpts()),

Argyrios Kyrtzidis

2011-05-17 22:09:56 +0000

[diff] [blame]

1120

Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

1121

MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),

1122

ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {

Chris Lattner

2010-11-17 07:21:13 +0000

[diff] [blame]

1123

init(StringToks, NumStringToks);

1124

}

1125

1126

void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){

Argyrios Kyrtzidis

2011-05-17 22:09:56 +0000

[diff] [blame]

1127

// The literal token may have come from an invalid source location (e.g. due

1128

// to a PCH error), in which case the token length will be 0.

Argyrios Kyrtzidis

2012-05-03 17:50:32 +0000

[diff] [blame]

1129

if (NumStringToks == 0 || StringToks[0].getLength() < 2)

1130

return DiagnoseLexingError(SourceLocation());

Argyrios Kyrtzidis

2011-05-17 22:09:56 +0000

[diff] [blame]

1131

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1132

// Scan all of the string portions, remember the max individual token length,

1133

// computing a bound on the concatenated string length, and see whether any

1134

// piece is a wide-string. If any of the string portions is a wide-string

1135

// literal, the result is a wide-string literal [C99 6.4.5p4].

Argyrios Kyrtzidis

2011-05-17 22:09:56 +0000

[diff] [blame]

1136

assert(NumStringToks && "expected at least one token");

Sean Hunt

2010-08-30 17:47:05 +0000

[diff] [blame]

1137

MaxTokenLength = StringToks[0].getLength();

Argyrios Kyrtzidis

2011-05-17 22:09:56 +0000

[diff] [blame]

1138

assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");

Sean Hunt

2010-08-30 17:47:05 +0000

[diff] [blame]

1139

SizeBound = StringToks[0].getLength()-2; // -2 for "".

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

1140

Kind = StringToks[0].getKind();

Sean Hunt

2010-08-30 17:47:05 +0000

[diff] [blame]

1141

1142

hadError = false;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1143

1144

// Implement Translation Phase #6: concatenation of string literals

1145

/// (C99 5.1.1.2p1). The common case is only one string fragment.

1146

for (unsigned i = 1; i != NumStringToks; ++i) {

Argyrios Kyrtzidis

2012-05-03 17:50:32 +0000

[diff] [blame]

1147

if (StringToks[i].getLength() < 2)

1148

return DiagnoseLexingError(StringToks[i].getLocation());

Argyrios Kyrtzidis

2011-05-17 22:09:56 +0000

[diff] [blame]

1149

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1150

// The string could be shorter than this if it needs cleaning, but this is a

1151

// reasonable bound, which is all we need.

Argyrios Kyrtzidis

2011-05-17 22:09:56 +0000

[diff] [blame]

1152

assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");

Sean Hunt

2010-08-30 17:47:05 +0000

[diff] [blame]

1153

SizeBound += StringToks[i].getLength()-2; // -2 for "".

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1154

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1155

// Remember maximum string piece length.

Sean Hunt

2010-08-30 17:47:05 +0000

[diff] [blame]

1156

if (StringToks[i].getLength() > MaxTokenLength)

1157

MaxTokenLength = StringToks[i].getLength();

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1158

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

1159

// Remember if we see any wide or utf-8/16/32 strings.

1160

// Also check for illegal concatenations.

1161

if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {

1162

if (isAscii()) {

1163

Kind = StringToks[i].getKind();

1164

} else {

1165

if (Diags)

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1166

Diags->Report(StringToks[i].getLocation(),

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

1167

diag::err_unsupported_string_concat);

1168

hadError = true;

1169

}

1170

}

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1171

}

Chris Lattner

dbb1ecc

2009-02-26 23:01:51 +0000

[diff] [blame]

1172

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1173

// Include space for the null terminator.

1174

++SizeBound;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1175

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1176

// TODO: K&R warning: "traditional C rejects string constant concatenation"

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1177

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

1178

// Get the width in bytes of char/wchar_t/char16_t/char32_t

1179

CharByteWidth = getCharWidth(Kind, Target);

1180

assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");

1181

CharByteWidth /= 8;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1182

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1183

// The output buffer size needs to be large enough to hold wide characters.

1184

// This is a worst-case assumption which basically corresponds to L"" "long".

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

1185

SizeBound *= CharByteWidth;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1186

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1187

// Size the temporary buffer to hold the result string data.

1188

ResultBuf.resize(SizeBound);

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1189

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1190

// Likewise, but for each string piece.

Dylan Noblesmith

f7ccbad

2012-02-05 02:13:05 +0000

[diff] [blame]

1191

SmallString<512> TokenBuf;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1192

TokenBuf.resize(MaxTokenLength);

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1193

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1194

// Loop over all the strings, getting their spelling, and expanding them to

1195

// wide strings as appropriate.

1196

ResultPtr = &ResultBuf[0]; // Next byte to fill in.

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1197

Anders Carlsson

ee98ac5

2007-10-15 02:50:23 +0000

[diff] [blame]

1198

Pascal = false;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1199

Richard Smith

2012-03-05 04:02:15 +0000

[diff] [blame]

1200

SourceLocation UDSuffixTokLoc;

1201

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1202

for (unsigned i = 0, e = NumStringToks; i != e; ++i) {

1203

const char *ThisTokBuf = &TokenBuf[0];

1204

// Get the spelling of the token, which eliminates trigraphs, etc. We know

1205

// that ThisTokBuf points to a buffer that is big enough for the whole token

1206

// and 'spelled' tokens can only shrink.

Douglas Gregor

2010-03-16 05:20:39 +0000

[diff] [blame]

1207

bool StringInvalid = false;

Chris Lattner

2010-11-17 07:21:13 +0000

[diff] [blame]

1208

unsigned ThisTokLen =

Chris Lattner

b060727

2010-11-17 07:26:20 +0000

[diff] [blame]

1209

Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,

1210

&StringInvalid);

Argyrios Kyrtzidis

2012-05-03 17:50:32 +0000

[diff] [blame]

1211

if (StringInvalid)

1212

return DiagnoseLexingError(StringToks[i].getLocation());

Douglas Gregor

2010-03-16 05:20:39 +0000

[diff] [blame]

1213

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

1214

const char *ThisTokBegin = ThisTokBuf;

Richard Smith

2012-03-05 04:02:15 +0000

[diff] [blame]

1215

const char *ThisTokEnd = ThisTokBuf+ThisTokLen;

1216

1217

// Remove an optional ud-suffix.

1218

if (ThisTokEnd[-1] != '"') {

1219

const char *UDSuffixEnd = ThisTokEnd;

1220

do {

1221

--ThisTokEnd;

1222

} while (ThisTokEnd[-1] != '"');

1223

1224

StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);

1225

1226

if (UDSuffixBuf.empty()) {

1227

UDSuffixBuf.assign(UDSuffix);

Richard Smith

dd66be7

2012-03-08 01:34:56 +0000

[diff] [blame]

1228

UDSuffixToken = i;

1229

UDSuffixOffset = ThisTokEnd - ThisTokBuf;

Richard Smith

2012-03-05 04:02:15 +0000

[diff] [blame]

1230

UDSuffixTokLoc = StringToks[i].getLocation();

1231

} else if (!UDSuffixBuf.equals(UDSuffix)) {

1232

// C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the

1233

// result of a concatenation involving at least one user-defined-string-

1234

// literal, all the participating user-defined-string-literals shall

1235

// have the same ud-suffix.

1236

if (Diags) {

1237

SourceLocation TokLoc = StringToks[i].getLocation();

1238

Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)

1239

<< UDSuffixBuf << UDSuffix

1240

<< SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)

1241

<< SourceRange(TokLoc, TokLoc);

}

hadError = true;

}

}

// Strip the end quote.

1248

--ThisTokEnd;

1249

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1250

// TODO: Input character set mapping support.

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1251

Craig Topper

1661d71

2011-08-08 06:10:39 +0000

[diff] [blame]

1252

// Skip marker for wide or unicode strings.

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

1253

if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1254

++ThisTokBuf;

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

1255

// Skip 8 of u8 marker for utf8 strings.

1256

if (ThisTokBuf[0] == '8')

1257

++ThisTokBuf;

Fariborz Jahanian

56bedef

2010-08-31 23:34:27 +0000

[diff] [blame]

1258

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1259

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1260

// Check for raw string

1261

if (ThisTokBuf[0] == 'R') {

1262

ThisTokBuf += 2; // skip R"

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1263

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1264

const char *Prefix = ThisTokBuf;

1265

while (ThisTokBuf[0] != '(')

Anders Carlsson

ee98ac5

2007-10-15 02:50:23 +0000

[diff] [blame]

1266

++ThisTokBuf;

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1267

++ThisTokBuf; // skip '('

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1268

Richard Smith

49d5174

2012-03-08 21:59:28 +0000

[diff] [blame]

1269

// Remove same number of characters from the end

1270

ThisTokEnd -= ThisTokBuf - Prefix;

1271

assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1272

1273

// Copy the string over

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1274

if (CopyStringFragment(StringToks[i], ThisTokBegin,

1275

StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))

1276

hadError = true;

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1277

} else {

Argyrios Kyrtzidis

07a0758

2012-05-03 01:01:56 +0000

[diff] [blame]

1278

if (ThisTokBuf[0] != '"') {

1279

// The file may have come from PCH and then changed after loading the

1280

// PCH; Fail gracefully.

Argyrios Kyrtzidis

2012-05-03 17:50:32 +0000

[diff] [blame]

1281

return DiagnoseLexingError(StringToks[i].getLocation());

Argyrios Kyrtzidis

07a0758

2012-05-03 01:01:56 +0000

[diff] [blame]

1282

}

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1283

++ThisTokBuf; // skip "

1284

1285

// Check if this is a pascal string

1286

if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&

1287

ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {

1288

1289

// If the \p sequence is found in the first token, we have a pascal string

1290

// Otherwise, if we already have a pascal string, ignore the first \p

1291

if (i == 0) {

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1292

++ThisTokBuf;

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

Pascal = true;

} else if (Pascal)

ThisTokBuf += 2;

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1297

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1298

while (ThisTokBuf != ThisTokEnd) {

1299

// Is this a span of non-escape characters?

1300

if (ThisTokBuf[0] != '\\') {

1301

const char *InStart = ThisTokBuf;

1302

do {

1303

++ThisTokBuf;

1304

} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');

1305

1306

// Copy the character span over.

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1307

if (CopyStringFragment(StringToks[i], ThisTokBegin,

1308

StringRef(InStart, ThisTokBuf - InStart)))

1309

hadError = true;

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1310

continue;

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1311

}

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1312

// Is this a Universal Character Name escape?

1313

if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {

Richard Smith

2012-03-09 22:27:51 +0000

[diff] [blame]

1314

EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,

1315

ResultPtr, hadError,

1316

FullSourceLoc(StringToks[i].getLocation(), SM),

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1317

CharByteWidth, Diags, Features);

1318

continue;

1319

}

1320

// Otherwise, this is a non-UCN escape character. Process it.

1321

unsigned ResultChar =

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1322

ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1323

FullSourceLoc(StringToks[i].getLocation(), SM),

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1324

CharByteWidth*8, Diags, Features);

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1325

Eli Friedman

2011-11-02 23:06:23 +0000

[diff] [blame]

1326

if (CharByteWidth == 4) {

1327

// FIXME: Make the type of the result buffer correct instead of

1328

// using reinterpret_cast.

1329

UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);

Nico Weber

9b483df

2011-11-14 05:17:37 +0000

[diff] [blame]

1330

*ResultWidePtr = ResultChar;

Eli Friedman

2011-11-02 23:06:23 +0000

[diff] [blame]

1331

ResultPtr += 4;

1332

} else if (CharByteWidth == 2) {

1333

// FIXME: Make the type of the result buffer correct instead of

1334

// using reinterpret_cast.

1335

UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);

Nico Weber

9b483df

2011-11-14 05:17:37 +0000

[diff] [blame]

1336

*ResultWidePtr = ResultChar & 0xFFFF;

Eli Friedman

2011-11-02 23:06:23 +0000

[diff] [blame]

1337

ResultPtr += 2;

1338

} else {

1339

assert(CharByteWidth == 1 && "Unexpected char width");

1340

*ResultPtr++ = ResultChar & 0xFF;

1341

}

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1342

}

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1343

}

1344

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1345

Chris Lattner

bbee00b

2009-01-16 18:51:42 +0000

[diff] [blame]

1346

if (Pascal) {

Eli Friedman

22508f4

2011-11-05 00:41:04 +0000

[diff] [blame]

1347

if (CharByteWidth == 4) {

1348

// FIXME: Make the type of the result buffer correct instead of

1349

// using reinterpret_cast.

1350

UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());

1351

ResultWidePtr[0] = GetNumStringChars() - 1;

1352

} else if (CharByteWidth == 2) {

1353

// FIXME: Make the type of the result buffer correct instead of

1354

// using reinterpret_cast.

1355

UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());

1356

ResultWidePtr[0] = GetNumStringChars() - 1;

1357

} else {

1358

assert(CharByteWidth == 1 && "Unexpected char width");

1359

ResultBuf[0] = GetNumStringChars() - 1;

1360

}

Chris Lattner

bbee00b

2009-01-16 18:51:42 +0000

[diff] [blame]

1361

1362

// Verify that pascal strings aren't too large.

Chris Lattner

2010-11-17 07:21:13 +0000

[diff] [blame]

1363

if (GetStringLength() > 256) {

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1364

if (Diags)

1365

Diags->Report(StringToks[0].getLocation(),

Chris Lattner

2010-11-17 07:21:13 +0000

[diff] [blame]

1366

diag::err_pascal_string_too_long)

1367

<< SourceRange(StringToks[0].getLocation(),

1368

StringToks[NumStringToks-1].getLocation());

Douglas Gregor

2011-07-27 05:40:30 +0000

[diff] [blame]

1369

hadError = true;

Eli Friedman

57d7dde

2009-04-01 03:17:08 +0000

[diff] [blame]

1370

return;

1371

}

Chris Lattner

2010-11-17 07:21:13 +0000

[diff] [blame]

1372

} else if (Diags) {

Douglas Gregor

2010-07-20 14:33:20 +0000

[diff] [blame]

1373

// Complain if this string literal has too many characters.

Chris Lattner

a95880d

2010-11-17 07:12:42 +0000

[diff] [blame]

1374

unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;

Douglas Gregor

2010-07-20 14:33:20 +0000

[diff] [blame]

1375

1376

if (GetNumStringChars() > MaxChars)

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1377

Diags->Report(StringToks[0].getLocation(),

Chris Lattner

2010-11-17 07:21:13 +0000

[diff] [blame]

1378

diag::ext_string_too_long)

Douglas Gregor

2010-07-20 14:33:20 +0000

[diff] [blame]

1379

<< GetNumStringChars() << MaxChars

Chris Lattner

a95880d

2010-11-17 07:12:42 +0000

[diff] [blame]

1380

<< (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)

Douglas Gregor

2010-07-20 14:33:20 +0000

[diff] [blame]

1381

<< SourceRange(StringToks[0].getLocation(),

1382

StringToks[NumStringToks-1].getLocation());

Chris Lattner

bbee00b

2009-01-16 18:51:42 +0000

[diff] [blame]

1383

}

Reid Spencer

2007-07-11 17:01:13 +0000

[diff] [blame]

1384

}

Chris Lattner

2009-02-18 19:21:10 +0000

[diff] [blame]

1385

Seth Cantrell

2012-10-28 18:24:46 +0000

[diff] [blame]

1386

static const char *resync_utf8(const char *err, const char *end) {

1387

if (err==end)

1388

return end;

1389

end = err + std::min<unsigned>(getNumBytesForUTF8(*err), end-err);

1390

while (++err!=end && (*err&0xC0)==0x80)

;

return err;

}

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1395

/// \brief This function copies from Fragment, which is a sequence of bytes

1396

/// within Tok's contents (which begin at TokBegin) into ResultPtr.

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1397

/// Performs widening for multi-byte characters.

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1398

bool StringLiteralParser::CopyStringFragment(const Token &Tok,

1399

const char *TokBegin,

1400

StringRef Fragment) {

1401

const UTF8 *ErrorPtrTmp;

1402

if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))

1403

return false;

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1404

Eli Friedman

2012-02-11 05:08:10 +0000

[diff] [blame]

1405

// If we see bad encoding for unprefixed string literals, warn and

1406

// simply copy the byte values, for compatibility with gcc and older

1407

// versions of clang.

1408

bool NoErrorOnBadEncoding = isAscii();

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1409

if (NoErrorOnBadEncoding) {

1410

memcpy(ResultPtr, Fragment.data(), Fragment.size());

1411

ResultPtr += Fragment.size();

1412

}

Seth Cantrell

2012-10-28 18:24:46 +0000

[diff] [blame]

1413

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1414

if (Diags) {

Seth Cantrell

2012-10-28 18:24:46 +0000

[diff] [blame]

1415

const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);

1416

1417

FullSourceLoc SourceLoc(Tok.getLocation(), SM);

1418

const DiagnosticBuilder &Builder =

1419

Diag(Diags, Features, SourceLoc, TokBegin,

1420

ErrorPtr, resync_utf8(ErrorPtr, Fragment.end()),

1421

NoErrorOnBadEncoding ? diag::warn_bad_string_encoding

1422

: diag::err_bad_string_encoding);

1423

1424

char *SavedResultPtr = ResultPtr;

1425

const char *NextStart = resync_utf8(ErrorPtr, Fragment.end());

1426

StringRef NextFragment(NextStart, Fragment.end()-NextStart);

1427

David Blaikie

82c6dc7

2012-10-30 23:22:22 +0000

[diff] [blame^]

1428

while (!Builder.hasMaxRanges() &&

1429

!ConvertUTF8toWide(CharByteWidth, NextFragment, ResultPtr,

Seth Cantrell

2012-10-28 18:24:46 +0000

[diff] [blame]

1430

ErrorPtrTmp)) {

1431

const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);

1432

NextStart = resync_utf8(ErrorPtr, Fragment.end());

1433

Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,

1434

ErrorPtr, NextStart);

1435

NextFragment = StringRef(NextStart, Fragment.end()-NextStart);

1436

}

1437

1438

ResultPtr = SavedResultPtr;

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1439

}

Eli Friedman

2012-02-11 05:08:10 +0000

[diff] [blame]

1440

return !NoErrorOnBadEncoding;

1441

}

Craig Topper

2011-08-11 04:06:15 +0000

[diff] [blame]

1442

Argyrios Kyrtzidis

2012-05-03 17:50:32 +0000

[diff] [blame]

1443

void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
  // The error flag is set whether or not a diagnostics engine is attached.
  hadError = true;

  if (!Diags)
    return;

  // Report the generic string-lexing error at Loc.
  Diags->Report(Loc, diag::err_lexing_string);
}

1448

Chris Lattner

2009-02-18 19:21:10 +0000

[diff] [blame]

1449

/// getOffsetOfStringByte - This function returns the offset of the

1450

/// specified byte of the string data represented by Token. This handles

1451

/// advancing over escape sequences in the string.

1452

unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,

Chris Lattner

6c66f07

2010-11-17 06:46:14 +0000

[diff] [blame]

1453

unsigned ByteNo) const {

Chris Lattner

2009-02-18 19:21:10 +0000

[diff] [blame]

1454

// Get the spelling of the token.

Dylan Noblesmith

f7ccbad

2012-02-05 02:13:05 +0000

[diff] [blame]

1455

SmallString<32> SpellingBuffer;

Sean Hunt

2010-08-30 17:47:05 +0000

[diff] [blame]

1456

SpellingBuffer.resize(Tok.getLength());

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1457

Douglas Gregor

2010-03-16 05:20:39 +0000

[diff] [blame]

1458

bool StringInvalid = false;

Chris Lattner

2009-02-18 19:21:10 +0000

[diff] [blame]

1459

const char *SpellingPtr = &SpellingBuffer[0];

Chris Lattner

b060727

2010-11-17 07:26:20 +0000

[diff] [blame]

1460

unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,

1461

&StringInvalid);

Chris Lattner

2010-11-17 06:26:08 +0000

[diff] [blame]

1462

if (StringInvalid)

Douglas Gregor

2010-03-16 05:20:39 +0000

[diff] [blame]

1463

return 0;

Chris Lattner

2009-02-18 19:21:10 +0000

[diff] [blame]

1464

Chris Lattner

2009-02-18 19:21:10 +0000

[diff] [blame]

1465

const char *SpellingStart = SpellingPtr;

1466

const char *SpellingEnd = SpellingPtr+TokLen;

1467

Richard Smith

2012-06-13 05:37:23 +0000

[diff] [blame]

1468

// Handle UTF-8 strings just like narrow strings.

1469

if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')

1470

SpellingPtr += 2;

1471

1472

assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&

1473

SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");

1474

1475

// For raw string literals, this is easy.

1476

if (SpellingPtr[0] == 'R') {

1477

assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");

1478

// Skip 'R"'.

1479

SpellingPtr += 2;

1480

while (*SpellingPtr != '(') {

1481

++SpellingPtr;

1482

assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");

}

// Skip '('.

++SpellingPtr;

return SpellingPtr - SpellingStart + ByteNo;

1487

}

1488

1489

// Skip over the leading quote

Chris Lattner

2009-02-18 19:21:10 +0000

[diff] [blame]

1490

assert(SpellingPtr[0] == '"' && "Should be a string literal!");

1491

++SpellingPtr;

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1492

Chris Lattner

2009-02-18 19:21:10 +0000

[diff] [blame]

1493

// Skip over bytes until we find the offset we're looking for.

1494

while (ByteNo) {

1495

assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1496

Chris Lattner

2009-02-18 19:21:10 +0000

[diff] [blame]

1497

// Step over non-escapes simply.

1498

if (*SpellingPtr != '\\') {

++SpellingPtr;

--ByteNo;

continue;

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1503

Chris Lattner

2009-02-18 19:21:10 +0000

[diff] [blame]

1504

// Otherwise, this is an escape character. Advance over it.

1505

bool HadError = false;

Richard Smith

2012-06-13 05:37:23 +0000

[diff] [blame]

1506

if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {

1507

const char *EscapePtr = SpellingPtr;

1508

unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,

1509

1, Features, HadError);

1510

if (Len > ByteNo) {

1511

// ByteNo is somewhere within the escape sequence.

1512

SpellingPtr = EscapePtr;

break;

}

ByteNo -= Len;

} else {

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1517

ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,

Richard Smith

2012-06-13 05:37:23 +0000

[diff] [blame]

1518

FullSourceLoc(Tok.getLocation(), SM),

Richard Smith

2012-09-08 07:16:20 +0000

[diff] [blame]

1519

CharByteWidth*8, Diags, Features);

Richard Smith

2012-06-13 05:37:23 +0000

[diff] [blame]

1520

--ByteNo;

1521

}

Chris Lattner

2009-02-18 19:21:10 +0000

[diff] [blame]

1522

assert(!HadError && "This method isn't valid on erroneous strings");

Chris Lattner

2009-02-18 19:21:10 +0000

[diff] [blame]

1523

}

Mike Stump

2009-09-09 15:08:12 +0000

[diff] [blame]

1524

Chris Lattner