blob: e4441c13f75f1ace95f8b9ba7613e18574d6721d [file] [log] [blame]
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00001#include "clang/AST/CommentLexer.h"
Dmitri Gribenkoaa580812012-08-09 00:03:17 +00002#include "clang/AST/CommentCommandTraits.h"
Dmitri Gribenkoc934dfe2013-01-19 22:06:05 +00003#include "llvm/ADT/StringExtras.h"
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00004#include "llvm/ADT/StringSwitch.h"
Dmitri Gribenkocb5620c2013-01-30 12:06:08 +00005#include "llvm/Support/ConvertUTF.h"
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00006#include "llvm/Support/ErrorHandling.h"
7
8namespace clang {
9namespace comments {
10
11void Token::dump(const Lexer &L, const SourceManager &SM) const {
12 llvm::errs() << "comments::Token Kind=" << Kind << " ";
13 Loc.dump(SM);
14 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
15}
16
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000017namespace {
/// Returns true if \p C may appear in the name of a named HTML character
/// reference, i.e. it is an ASCII letter.
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  return IsLower || IsUpper;
}
22
/// Returns true if \p C is an ASCII decimal digit, the only characters
/// accepted in the body of a decimal character reference.
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  if (C < '0')
    return false;
  return C <= '9';
}
26
/// Returns true if \p C is an ASCII hexadecimal digit (0-9, a-f, A-F).
bool isHTMLHexCharacterReferenceCharacter(char C) {
  if ('0' <= C && C <= '9')
    return true;
  if ('a' <= C && C <= 'f')
    return true;
  return 'A' <= C && C <= 'F';
}
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +000032
Dmitri Gribenko5bd1e5b2013-01-30 14:29:28 +000033StringRef convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator,
34 unsigned CodePoint) {
Fariborz Jahanian658a1152013-01-29 23:42:26 +000035 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
36 char *ResolvedPtr = Resolved;
Dmitri Gribenkocb5620c2013-01-30 12:06:08 +000037 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
Fariborz Jahanian658a1152013-01-29 23:42:26 +000038 return StringRef(Resolved, ResolvedPtr - Resolved);
39 else
40 return StringRef();
41}
Dmitri Gribenko5bd1e5b2013-01-30 14:29:28 +000042
43#include "clang/AST/CommentHTMLTags.inc"
44#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
45
46} // unnamed namespace
Fariborz Jahanian658a1152013-01-29 23:42:26 +000047
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000048StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
Dmitri Gribenko5bd1e5b2013-01-30 14:29:28 +000049 // Fast path, first check a few most widely used named character references.
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000050 return llvm::StringSwitch<StringRef>(Name)
51 .Case("amp", "&")
52 .Case("lt", "<")
53 .Case("gt", ">")
54 .Case("quot", "\"")
55 .Case("apos", "\'")
Dmitri Gribenko5bd1e5b2013-01-30 14:29:28 +000056 // Slow path.
57 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
Fariborz Jahanian658a1152013-01-29 23:42:26 +000058}
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000059
60StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
61 unsigned CodePoint = 0;
62 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
63 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
64 CodePoint *= 10;
65 CodePoint += Name[i] - '0';
66 }
Dmitri Gribenko5bd1e5b2013-01-30 14:29:28 +000067 return convertCodePointToUTF8(Allocator, CodePoint);
68}
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000069
Dmitri Gribenko5bd1e5b2013-01-30 14:29:28 +000070StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
71 unsigned CodePoint = 0;
72 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
73 CodePoint *= 16;
74 const char C = Name[i];
75 assert(isHTMLHexCharacterReferenceCharacter(C));
76 CodePoint += llvm::hexDigitValue(C);
77 }
78 return convertCodePointToUTF8(Allocator, CodePoint);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000079}
80
Dmitri Gribenko2d44d772012-06-26 20:39:18 +000081void Lexer::skipLineStartingDecorations() {
82 // This function should be called only for C comments
83 assert(CommentState == LCS_InsideCComment);
84
85 if (BufferPtr == CommentEnd)
86 return;
87
88 switch (*BufferPtr) {
89 case ' ':
90 case '\t':
91 case '\f':
92 case '\v': {
93 const char *NewBufferPtr = BufferPtr;
94 NewBufferPtr++;
95 if (NewBufferPtr == CommentEnd)
96 return;
97
98 char C = *NewBufferPtr;
99 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
100 NewBufferPtr++;
101 if (NewBufferPtr == CommentEnd)
102 return;
103 C = *NewBufferPtr;
104 }
105 if (C == '*')
106 BufferPtr = NewBufferPtr + 1;
107 break;
108 }
109 case '*':
110 BufferPtr++;
111 break;
112 }
113}
114
115namespace {
/// Returns a pointer to the first '\n' or '\r' in [BufferPtr, BufferEnd), or
/// BufferEnd if the range contains neither.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd && *Cursor != '\n' && *Cursor != '\r')
    ++Cursor;
  return Cursor;
}
125
/// Skips one line terminator ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \pre BufferPtr is either BufferEnd or points at '\n' or '\r'.
/// \returns the position just past the terminator; BufferPtr unchanged when
/// it already equals BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr++;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single terminator.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
140
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000141const char *skipNamedCharacterReference(const char *BufferPtr,
142 const char *BufferEnd) {
143 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
145 return BufferPtr;
146 }
147 return BufferEnd;
148}
149
150const char *skipDecimalCharacterReference(const char *BufferPtr,
151 const char *BufferEnd) {
152 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
154 return BufferPtr;
155 }
156 return BufferEnd;
157}
158
159const char *skipHexCharacterReference(const char *BufferPtr,
160 const char *BufferEnd) {
161 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
163 return BufferPtr;
164 }
165 return BufferEnd;
166}
167
/// Returns true if \p C may begin an HTML identifier, i.e. it is an ASCII
/// letter.
bool isHTMLIdentifierStartingCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  return 'A' <= C && C <= 'Z';
}
172
/// Returns true if \p C may appear inside an HTML identifier, i.e. it is an
/// ASCII letter or decimal digit.
bool isHTMLIdentifierCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  const bool IsDigit = '0' <= C && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
178
179const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
180 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
181 if (!isHTMLIdentifierCharacter(*BufferPtr))
182 return BufferPtr;
183 }
184 return BufferEnd;
185}
186
/// Scans an HTML string delimited by single or double quotes.  A quote that
/// is immediately preceded by a backslash does not terminate the string.
///
/// \pre *BufferPtr is the opening quote character.
/// \returns a pointer to the closing quote, or BufferEnd if the string is not
/// terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    if (*Cursor == Quote && *(Cursor - 1) != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
204
/// Returns true for whitespace that does not end a line: space, tab, form
/// feed and vertical tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
208
/// Returns true if \p C is a space, a line terminator ('\n', '\r'), a tab, a
/// form feed or a vertical tab.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
213
214const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
215 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
216 if (!isWhitespace(*BufferPtr))
217 return BufferPtr;
218 }
219 return BufferEnd;
220}
221
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000222bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
223 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
224}
225
/// Returns true if \p C may begin a command name, i.e. it is an ASCII letter.
bool isCommandNameStartCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  return 'A' <= C && C <= 'Z';
}
230
/// Returns true if \p C may appear inside a command name, i.e. it is an ASCII
/// letter or decimal digit.
bool isCommandNameCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  const bool IsDigit = '0' <= C && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
236
237const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
238 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
239 if (!isCommandNameCharacter(*BufferPtr))
240 return BufferPtr;
241 }
242 return BufferEnd;
243}
244
245/// Return the one past end pointer for BCPL comments.
246/// Handles newlines escaped with backslash or trigraph for backslahs.
247const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
248 const char *CurPtr = BufferPtr;
249 while (CurPtr != BufferEnd) {
250 char C = *CurPtr;
251 while (C != '\n' && C != '\r') {
252 CurPtr++;
253 if (CurPtr == BufferEnd)
254 return BufferEnd;
255 C = *CurPtr;
256 }
257 // We found a newline, check if it is escaped.
258 const char *EscapePtr = CurPtr - 1;
259 while(isHorizontalWhitespace(*EscapePtr))
260 EscapePtr--;
261
262 if (*EscapePtr == '\\' ||
263 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
264 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
265 // We found an escaped newline.
266 CurPtr = skipNewline(CurPtr, BufferEnd);
267 } else
268 return CurPtr; // Not an escaped newline.
269 }
270 return BufferEnd;
271}
272
273/// Return the one past end pointer for C comments.
274/// Very dumb, does not handle escaped newlines or trigraphs.
275const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
276 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
277 if (*BufferPtr == '*') {
278 assert(BufferPtr + 1 != BufferEnd);
279 if (*(BufferPtr + 1) == '/')
280 return BufferPtr;
281 }
282 }
283 llvm_unreachable("buffer end hit before '*/' was seen");
284}
285} // unnamed namespace
286
287void Lexer::lexCommentText(Token &T) {
288 assert(CommentState == LCS_InsideBCPLComment ||
289 CommentState == LCS_InsideCComment);
290
291 switch (State) {
292 case LS_Normal:
293 break;
294 case LS_VerbatimBlockFirstLine:
295 lexVerbatimBlockFirstLine(T);
296 return;
297 case LS_VerbatimBlockBody:
298 lexVerbatimBlockBody(T);
299 return;
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000300 case LS_VerbatimLineText:
301 lexVerbatimLineText(T);
302 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000303 case LS_HTMLStartTag:
304 lexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000305 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000306 case LS_HTMLEndTag:
307 lexHTMLEndTag(T);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000308 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000309 }
310
311 assert(State == LS_Normal);
312
313 const char *TokenPtr = BufferPtr;
314 assert(TokenPtr < CommentEnd);
315 while (TokenPtr != CommentEnd) {
316 switch(*TokenPtr) {
317 case '\\':
318 case '@': {
319 TokenPtr++;
320 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000321 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000322 return;
323 }
324 char C = *TokenPtr;
325 switch (C) {
326 default:
327 break;
328
329 case '\\': case '@': case '&': case '$':
330 case '#': case '<': case '>': case '%':
331 case '\"': case '.': case ':':
332 // This is one of \\ \@ \& \$ etc escape sequences.
333 TokenPtr++;
334 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
335 // This is the \:: escape sequence.
336 TokenPtr++;
337 }
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000338 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000339 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000340 T.setText(UnescapedText);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000341 return;
342 }
343
344 // Don't make zero-length commands.
Dmitri Gribenko8c05da32012-09-14 16:35:35 +0000345 if (!isCommandNameStartCharacter(*TokenPtr)) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000346 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000347 return;
348 }
349
350 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
351 unsigned Length = TokenPtr - (BufferPtr + 1);
352
353 // Hardcoded support for lexing LaTeX formula commands
354 // \f$ \f[ \f] \f{ \f} as a single command.
355 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
356 C = *TokenPtr;
357 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
358 TokenPtr++;
359 Length++;
360 }
361 }
362
363 const StringRef CommandName(BufferPtr + 1, Length);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000364
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000365 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
366 if (!Info) {
367 formTokenWithChars(T, TokenPtr, tok::unknown_command);
368 T.setUnknownCommandName(CommandName);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000369 return;
370 }
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000371 if (Info->IsVerbatimBlockCommand) {
372 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
373 return;
374 }
375 if (Info->IsVerbatimLineCommand) {
376 setupAndLexVerbatimLine(T, TokenPtr, Info);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000377 return;
378 }
379 formTokenWithChars(T, TokenPtr, tok::command);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000380 T.setCommandID(Info->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000381 return;
382 }
383
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000384 case '&':
385 lexHTMLCharacterReference(T);
386 return;
387
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000388 case '<': {
389 TokenPtr++;
390 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000391 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000392 return;
393 }
394 const char C = *TokenPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000395 if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000396 setupAndLexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000397 else if (C == '/')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000398 setupAndLexHTMLEndTag(T);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000399 else
400 formTextToken(T, TokenPtr);
401
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000402 return;
403 }
404
405 case '\n':
406 case '\r':
407 TokenPtr = skipNewline(TokenPtr, CommentEnd);
408 formTokenWithChars(T, TokenPtr, tok::newline);
409
410 if (CommentState == LCS_InsideCComment)
411 skipLineStartingDecorations();
412 return;
413
414 default: {
Dmitri Gribenkoaa7dbaf2012-12-30 19:45:46 +0000415 size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
416 find_first_of("\n\r\\@&<");
417 if (End != StringRef::npos)
418 TokenPtr += End;
419 else
420 TokenPtr = CommentEnd;
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000421 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000422 return;
423 }
424 }
425 }
426}
427
428void Lexer::setupAndLexVerbatimBlock(Token &T,
429 const char *TextBegin,
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000430 char Marker, const CommandInfo *Info) {
431 assert(Info->IsVerbatimBlockCommand);
432
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000433 VerbatimBlockEndCommandName.clear();
434 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000435 VerbatimBlockEndCommandName.append(Info->EndCommandName);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000436
437 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000438 T.setVerbatimBlockID(Info->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000439
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000440 // If there is a newline following the verbatim opening command, skip the
441 // newline so that we don't create an tok::verbatim_block_line with empty
442 // text content.
443 if (BufferPtr != CommentEnd) {
444 const char C = *BufferPtr;
445 if (C == '\n' || C == '\r') {
446 BufferPtr = skipNewline(BufferPtr, CommentEnd);
447 State = LS_VerbatimBlockBody;
448 return;
449 }
450 }
451
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000452 State = LS_VerbatimBlockFirstLine;
453}
454
455void Lexer::lexVerbatimBlockFirstLine(Token &T) {
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000456again:
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000457 assert(BufferPtr < CommentEnd);
458
459 // FIXME: It would be better to scan the text once, finding either the block
460 // end command or newline.
461 //
462 // Extract current line.
463 const char *Newline = findNewline(BufferPtr, CommentEnd);
464 StringRef Line(BufferPtr, Newline - BufferPtr);
465
466 // Look for end command in current line.
467 size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000468 const char *TextEnd;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000469 const char *NextLine;
470 if (Pos == StringRef::npos) {
471 // Current line is completely verbatim.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000472 TextEnd = Newline;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000473 NextLine = skipNewline(Newline, CommentEnd);
474 } else if (Pos == 0) {
475 // Current line contains just an end command.
476 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000477 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000478 formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000479 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000480 State = LS_Normal;
481 return;
482 } else {
483 // There is some text, followed by end command. Extract text first.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000484 TextEnd = BufferPtr + Pos;
485 NextLine = TextEnd;
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000486 // If there is only whitespace before end command, skip whitespace.
487 if (isWhitespace(BufferPtr, TextEnd)) {
488 BufferPtr = TextEnd;
489 goto again;
490 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000491 }
492
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000493 StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000494 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000495 T.setVerbatimBlockText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000496
497 State = LS_VerbatimBlockBody;
498}
499
500void Lexer::lexVerbatimBlockBody(Token &T) {
501 assert(State == LS_VerbatimBlockBody);
502
503 if (CommentState == LCS_InsideCComment)
504 skipLineStartingDecorations();
505
506 lexVerbatimBlockFirstLine(T);
507}
508
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000509void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
510 const CommandInfo *Info) {
511 assert(Info->IsVerbatimLineCommand);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000512 formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000513 T.setVerbatimLineID(Info->getID());
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000514
515 State = LS_VerbatimLineText;
516}
517
518void Lexer::lexVerbatimLineText(Token &T) {
519 assert(State == LS_VerbatimLineText);
520
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000521 // Extract current line.
522 const char *Newline = findNewline(BufferPtr, CommentEnd);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000523 const StringRef Text(BufferPtr, Newline - BufferPtr);
524 formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000525 T.setVerbatimLineText(Text);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000526
527 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000528}
529
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000530void Lexer::lexHTMLCharacterReference(Token &T) {
531 const char *TokenPtr = BufferPtr;
532 assert(*TokenPtr == '&');
533 TokenPtr++;
534 if (TokenPtr == CommentEnd) {
535 formTextToken(T, TokenPtr);
536 return;
537 }
538 const char *NamePtr;
539 bool isNamed = false;
540 bool isDecimal = false;
541 char C = *TokenPtr;
542 if (isHTMLNamedCharacterReferenceCharacter(C)) {
543 NamePtr = TokenPtr;
544 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
545 isNamed = true;
546 } else if (C == '#') {
547 TokenPtr++;
548 if (TokenPtr == CommentEnd) {
549 formTextToken(T, TokenPtr);
550 return;
551 }
552 C = *TokenPtr;
553 if (isHTMLDecimalCharacterReferenceCharacter(C)) {
554 NamePtr = TokenPtr;
555 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
556 isDecimal = true;
557 } else if (C == 'x' || C == 'X') {
558 TokenPtr++;
559 NamePtr = TokenPtr;
560 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
561 } else {
562 formTextToken(T, TokenPtr);
563 return;
564 }
565 } else {
566 formTextToken(T, TokenPtr);
567 return;
568 }
569 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
570 *TokenPtr != ';') {
571 formTextToken(T, TokenPtr);
572 return;
573 }
574 StringRef Name(NamePtr, TokenPtr - NamePtr);
575 TokenPtr++; // Skip semicolon.
576 StringRef Resolved;
Dmitri Gribenko5bd1e5b2013-01-30 14:29:28 +0000577 if (isNamed)
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000578 Resolved = resolveHTMLNamedCharacterReference(Name);
579 else if (isDecimal)
580 Resolved = resolveHTMLDecimalCharacterReference(Name);
581 else
582 Resolved = resolveHTMLHexCharacterReference(Name);
583
584 if (Resolved.empty()) {
585 formTextToken(T, TokenPtr);
586 return;
587 }
588 formTokenWithChars(T, TokenPtr, tok::text);
589 T.setText(Resolved);
590 return;
591}
592
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000593void Lexer::setupAndLexHTMLStartTag(Token &T) {
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000594 assert(BufferPtr[0] == '<' &&
595 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000596 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000597 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000598 if (!isHTMLTagName(Name)) {
599 formTextToken(T, TagNameEnd);
600 return;
601 }
602
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000603 formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
604 T.setHTMLTagStartName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000605
606 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
607
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000608 const char C = *BufferPtr;
609 if (BufferPtr != CommentEnd &&
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000610 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000611 State = LS_HTMLStartTag;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000612}
613
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000614void Lexer::lexHTMLStartTag(Token &T) {
615 assert(State == LS_HTMLStartTag);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000616
617 const char *TokenPtr = BufferPtr;
618 char C = *TokenPtr;
619 if (isHTMLIdentifierCharacter(C)) {
620 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000621 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000622 formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000623 T.setHTMLIdent(Ident);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000624 } else {
625 switch (C) {
626 case '=':
627 TokenPtr++;
628 formTokenWithChars(T, TokenPtr, tok::html_equals);
629 break;
630 case '\"':
631 case '\'': {
632 const char *OpenQuote = TokenPtr;
633 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
634 const char *ClosingQuote = TokenPtr;
635 if (TokenPtr != CommentEnd) // Skip closing quote.
636 TokenPtr++;
637 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
638 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
639 ClosingQuote - (OpenQuote + 1)));
640 break;
641 }
642 case '>':
643 TokenPtr++;
644 formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000645 State = LS_Normal;
646 return;
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000647 case '/':
648 TokenPtr++;
649 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
650 TokenPtr++;
651 formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000652 } else
653 formTextToken(T, TokenPtr);
654
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000655 State = LS_Normal;
656 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000657 }
658 }
659
660 // Now look ahead and return to normal state if we don't see any HTML tokens
661 // ahead.
662 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
663 if (BufferPtr == CommentEnd) {
664 State = LS_Normal;
665 return;
666 }
667
668 C = *BufferPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000669 if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000670 C != '=' && C != '\"' && C != '\'' && C != '>') {
671 State = LS_Normal;
672 return;
673 }
674}
675
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000676void Lexer::setupAndLexHTMLEndTag(Token &T) {
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000677 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
678
679 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
680 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000681 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
682 if (!isHTMLTagName(Name)) {
683 formTextToken(T, TagNameEnd);
684 return;
685 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000686
687 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000688
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000689 formTokenWithChars(T, End, tok::html_end_tag);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000690 T.setHTMLTagEndName(Name);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000691
692 if (BufferPtr != CommentEnd && *BufferPtr == '>')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000693 State = LS_HTMLEndTag;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000694}
695
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000696void Lexer::lexHTMLEndTag(Token &T) {
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000697 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
698
699 formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
700 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000701}
702
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000703Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
Dmitri Gribenkoaf503a62012-08-31 10:35:30 +0000704 SourceLocation FileLoc,
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000705 const char *BufferStart, const char *BufferEnd):
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000706 Allocator(Allocator), Traits(Traits),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000707 BufferStart(BufferStart), BufferEnd(BufferEnd),
Dmitri Gribenkoaf503a62012-08-31 10:35:30 +0000708 FileLoc(FileLoc), BufferPtr(BufferStart),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000709 CommentState(LCS_BeforeComment), State(LS_Normal) {
710}
711
712void Lexer::lex(Token &T) {
713again:
714 switch (CommentState) {
715 case LCS_BeforeComment:
716 if (BufferPtr == BufferEnd) {
717 formTokenWithChars(T, BufferPtr, tok::eof);
718 return;
719 }
720
721 assert(*BufferPtr == '/');
722 BufferPtr++; // Skip first slash.
723 switch(*BufferPtr) {
724 case '/': { // BCPL comment.
725 BufferPtr++; // Skip second slash.
726
727 if (BufferPtr != BufferEnd) {
728 // Skip Doxygen magic marker, if it is present.
729 // It might be missing because of a typo //< or /*<, or because we
730 // merged this non-Doxygen comment into a bunch of Doxygen comments
731 // around it: /** ... */ /* ... */ /** ... */
732 const char C = *BufferPtr;
733 if (C == '/' || C == '!')
734 BufferPtr++;
735 }
736
737 // Skip less-than symbol that marks trailing comments.
738 // Skip it even if the comment is not a Doxygen one, because //< and /*<
739 // are frequent typos.
740 if (BufferPtr != BufferEnd && *BufferPtr == '<')
741 BufferPtr++;
742
743 CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000744 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
745 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000746 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
747 goto again;
748 }
749 case '*': { // C comment.
750 BufferPtr++; // Skip star.
751
752 // Skip Doxygen magic marker.
753 const char C = *BufferPtr;
754 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
755 BufferPtr++;
756
757 // Skip less-than symbol that marks trailing comments.
758 if (BufferPtr != BufferEnd && *BufferPtr == '<')
759 BufferPtr++;
760
761 CommentState = LCS_InsideCComment;
762 State = LS_Normal;
763 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
764 goto again;
765 }
766 default:
767 llvm_unreachable("second character of comment should be '/' or '*'");
768 }
769
770 case LCS_BetweenComments: {
771 // Consecutive comments are extracted only if there is only whitespace
772 // between them. So we can search for the start of the next comment.
773 const char *EndWhitespace = BufferPtr;
774 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
775 EndWhitespace++;
776
777 // Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000778 // between them -- guaranteed by comment extraction) into a newline. We
779 // have two newlines between C comments in total (first one was synthesized
780 // after a comment).
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000781 formTokenWithChars(T, EndWhitespace, tok::newline);
782
783 CommentState = LCS_BeforeComment;
784 break;
785 }
786
787 case LCS_InsideBCPLComment:
788 case LCS_InsideCComment:
789 if (BufferPtr != CommentEnd) {
790 lexCommentText(T);
791 break;
792 } else {
793 // Skip C comment closing sequence.
794 if (CommentState == LCS_InsideCComment) {
795 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
796 BufferPtr += 2;
797 assert(BufferPtr <= BufferEnd);
798
799 // Synthenize newline just after the C comment, regardless if there is
800 // actually a newline.
801 formTokenWithChars(T, BufferPtr, tok::newline);
802
803 CommentState = LCS_BetweenComments;
804 break;
805 } else {
806 // Don't synthesized a newline after BCPL comment.
807 CommentState = LCS_BetweenComments;
808 goto again;
809 }
810 }
811 }
812}
813
814StringRef Lexer::getSpelling(const Token &Tok,
815 const SourceManager &SourceMgr,
816 bool *Invalid) const {
817 SourceLocation Loc = Tok.getLocation();
818 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
819
820 bool InvalidTemp = false;
821 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
822 if (InvalidTemp) {
823 *Invalid = true;
824 return StringRef();
825 }
826
827 const char *Begin = File.data() + LocInfo.second;
828 return StringRef(Begin, Tok.getLength());
829}
830
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000831} // end namespace comments
832} // end namespace clang
833