blob: e5529dad15142bd7bcec2a3d344bfd04c593055a [file] [log] [blame]
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00001#include "clang/AST/CommentLexer.h"
2#include "llvm/ADT/StringSwitch.h"
3#include "llvm/Support/ErrorHandling.h"
4
5namespace clang {
6namespace comments {
7
8void Token::dump(const Lexer &L, const SourceManager &SM) const {
9 llvm::errs() << "comments::Token Kind=" << Kind << " ";
10 Loc.dump(SM);
11 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
12}
13
14bool Lexer::isVerbatimBlockCommand(StringRef BeginName,
15 StringRef &EndName) const {
16 const char *Result = llvm::StringSwitch<const char *>(BeginName)
17 .Case("code", "endcode")
18 .Case("verbatim", "endverbatim")
19 .Case("htmlonly", "endhtmlonly")
20 .Case("latexonly", "endlatexonly")
21 .Case("xmlonly", "endxmlonly")
22 .Case("manonly", "endmanonly")
23 .Case("rtfonly", "endrtfonly")
24
25 .Case("dot", "enddot")
26 .Case("msc", "endmsc")
27
28 .Case("f$", "f$") // Inline LaTeX formula
29 .Case("f[", "f]") // Displayed LaTeX formula
30 .Case("f{", "f}") // LaTeX environment
31
32 .Default(NULL);
33
34 if (Result) {
35 EndName = Result;
36 return true;
37 }
38
39 for (VerbatimBlockCommandVector::const_iterator
40 I = VerbatimBlockCommands.begin(),
41 E = VerbatimBlockCommands.end();
42 I != E; ++I)
43 if (I->BeginName == BeginName) {
44 EndName = I->EndName;
45 return true;
46 }
47
48 return false;
49}
50
51bool Lexer::isVerbatimLineCommand(StringRef Name) const {
52 bool Result = llvm::StringSwitch<bool>(Name)
53 .Case("fn", true)
54 .Case("var", true)
55 .Case("property", true)
56 .Case("typedef", true)
57
58 .Case("overload", true)
59
60 .Case("defgroup", true)
61 .Case("ingroup", true)
62 .Case("addtogroup", true)
63 .Case("weakgroup", true)
64 .Case("name", true)
65
66 .Case("section", true)
67 .Case("subsection", true)
68 .Case("subsubsection", true)
69 .Case("paragraph", true)
70
71 .Case("mainpage", true)
72 .Case("subpage", true)
73 .Case("ref", true)
74
75 .Default(false);
76
77 if (Result)
78 return true;
79
80 for (VerbatimLineCommandVector::const_iterator
81 I = VerbatimLineCommands.begin(),
82 E = VerbatimLineCommands.end();
83 I != E; ++I)
84 if (I->Name == Name)
85 return true;
86
87 return false;
88}
89
90void Lexer::skipLineStartingDecorations() {
91 // This function should be called only for C comments
92 assert(CommentState == LCS_InsideCComment);
93
94 if (BufferPtr == CommentEnd)
95 return;
96
97 switch (*BufferPtr) {
98 case ' ':
99 case '\t':
100 case '\f':
101 case '\v': {
102 const char *NewBufferPtr = BufferPtr;
103 NewBufferPtr++;
104 if (NewBufferPtr == CommentEnd)
105 return;
106
107 char C = *NewBufferPtr;
108 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
109 NewBufferPtr++;
110 if (NewBufferPtr == CommentEnd)
111 return;
112 C = *NewBufferPtr;
113 }
114 if (C == '*')
115 BufferPtr = NewBufferPtr + 1;
116 break;
117 }
118 case '*':
119 BufferPtr++;
120 break;
121 }
122}
123
124namespace {
/// Return a pointer to the first '\n' or '\r' in [BufferPtr, BufferEnd), or
/// BufferEnd if the range contains neither.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  while (BufferPtr != BufferEnd && *BufferPtr != '\n' && *BufferPtr != '\r')
    ++BufferPtr;
  return BufferPtr;
}
133
134const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
135 if (BufferPtr == BufferEnd)
136 return BufferPtr;
137
138 if (*BufferPtr == '\n')
139 BufferPtr++;
140 else {
141 assert(*BufferPtr == '\r');
142 BufferPtr++;
143 if (BufferPtr != BufferEnd && *BufferPtr == '\n')
144 BufferPtr++;
145 }
146 return BufferPtr;
147}
148
/// Return true if \p C is an ASCII letter or an ASCII digit.
bool isHTMLIdentifierCharacter(char C) {
  if (C >= '0' && C <= '9')
    return true;
  if (C >= 'A' && C <= 'Z')
    return true;
  return C >= 'a' && C <= 'z';
}
154
155const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
156 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
157 if (!isHTMLIdentifierCharacter(*BufferPtr))
158 return BufferPtr;
159 }
160 return BufferEnd;
161}
162
163/// Skip HTML string quoted in single or double quotes. Escaping quotes inside
164/// string allowed.
165///
166/// Returns pointer to closing quote.
167const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
168{
169 const char Quote = *BufferPtr;
170 assert(Quote == '\"' || Quote == '\'');
171
172 BufferPtr++;
173 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
174 const char C = *BufferPtr;
175 if (C == Quote && BufferPtr[-1] != '\\')
176 return BufferPtr;
177 }
178 return BufferEnd;
179}
180
/// Return true if \p C is a space, horizontal tab, form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
184
/// Return true if \p C is a space, newline, carriage return, horizontal tab,
/// form feed or vertical tab.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
189
190const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
191 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
192 if (!isWhitespace(*BufferPtr))
193 return BufferPtr;
194 }
195 return BufferEnd;
196}
197
/// Return true if \p C may appear in a command name: an ASCII letter or an
/// ASCII digit.
bool isCommandNameCharacter(char C) {
  const bool IsLower = C >= 'a' && C <= 'z';
  const bool IsUpper = C >= 'A' && C <= 'Z';
  const bool IsDigit = C >= '0' && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
203
204const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
205 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
206 if (!isCommandNameCharacter(*BufferPtr))
207 return BufferPtr;
208 }
209 return BufferEnd;
210}
211
212/// Return the one past end pointer for BCPL comments.
213/// Handles newlines escaped with backslash or trigraph for backslahs.
214const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
215 const char *CurPtr = BufferPtr;
216 while (CurPtr != BufferEnd) {
217 char C = *CurPtr;
218 while (C != '\n' && C != '\r') {
219 CurPtr++;
220 if (CurPtr == BufferEnd)
221 return BufferEnd;
222 C = *CurPtr;
223 }
224 // We found a newline, check if it is escaped.
225 const char *EscapePtr = CurPtr - 1;
226 while(isHorizontalWhitespace(*EscapePtr))
227 EscapePtr--;
228
229 if (*EscapePtr == '\\' ||
230 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
231 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
232 // We found an escaped newline.
233 CurPtr = skipNewline(CurPtr, BufferEnd);
234 } else
235 return CurPtr; // Not an escaped newline.
236 }
237 return BufferEnd;
238}
239
240/// Return the one past end pointer for C comments.
241/// Very dumb, does not handle escaped newlines or trigraphs.
242const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
243 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
244 if (*BufferPtr == '*') {
245 assert(BufferPtr + 1 != BufferEnd);
246 if (*(BufferPtr + 1) == '/')
247 return BufferPtr;
248 }
249 }
250 llvm_unreachable("buffer end hit before '*/' was seen");
251}
252} // unnamed namespace
253
254void Lexer::lexCommentText(Token &T) {
255 assert(CommentState == LCS_InsideBCPLComment ||
256 CommentState == LCS_InsideCComment);
257
258 switch (State) {
259 case LS_Normal:
260 break;
261 case LS_VerbatimBlockFirstLine:
262 lexVerbatimBlockFirstLine(T);
263 return;
264 case LS_VerbatimBlockBody:
265 lexVerbatimBlockBody(T);
266 return;
267 case LS_HTMLOpenTag:
268 lexHTMLOpenTag(T);
269 return;
270 }
271
272 assert(State == LS_Normal);
273
274 const char *TokenPtr = BufferPtr;
275 assert(TokenPtr < CommentEnd);
276 while (TokenPtr != CommentEnd) {
277 switch(*TokenPtr) {
278 case '\\':
279 case '@': {
280 TokenPtr++;
281 if (TokenPtr == CommentEnd) {
282 formTokenWithChars(T, TokenPtr, tok::text);
283 T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
284 return;
285 }
286 char C = *TokenPtr;
287 switch (C) {
288 default:
289 break;
290
291 case '\\': case '@': case '&': case '$':
292 case '#': case '<': case '>': case '%':
293 case '\"': case '.': case ':':
294 // This is one of \\ \@ \& \$ etc escape sequences.
295 TokenPtr++;
296 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
297 // This is the \:: escape sequence.
298 TokenPtr++;
299 }
300 formTokenWithChars(T, TokenPtr, tok::text);
301 T.setText(StringRef(BufferPtr - (T.getLength() - 1),
302 T.getLength() - 1));
303 return;
304 }
305
306 // Don't make zero-length commands.
307 if (!isCommandNameCharacter(*TokenPtr)) {
308 formTokenWithChars(T, TokenPtr, tok::text);
309 T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
310 return;
311 }
312
313 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
314 unsigned Length = TokenPtr - (BufferPtr + 1);
315
316 // Hardcoded support for lexing LaTeX formula commands
317 // \f$ \f[ \f] \f{ \f} as a single command.
318 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
319 C = *TokenPtr;
320 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
321 TokenPtr++;
322 Length++;
323 }
324 }
325
326 const StringRef CommandName(BufferPtr + 1, Length);
327 StringRef EndName;
328
329 if (isVerbatimBlockCommand(CommandName, EndName)) {
330 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
331 return;
332 }
333 if (isVerbatimLineCommand(CommandName)) {
334 lexVerbatimLine(T, TokenPtr);
335 return;
336 }
337 formTokenWithChars(T, TokenPtr, tok::command);
338 T.setCommandName(CommandName);
339 return;
340 }
341
342 case '<': {
343 TokenPtr++;
344 if (TokenPtr == CommentEnd) {
345 formTokenWithChars(T, TokenPtr, tok::text);
346 T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
347 return;
348 }
349 const char C = *TokenPtr;
350 if (isHTMLIdentifierCharacter(C))
351 setupAndLexHTMLOpenTag(T);
352 else if (C == '/')
353 lexHTMLCloseTag(T);
354 return;
355 }
356
357 case '\n':
358 case '\r':
359 TokenPtr = skipNewline(TokenPtr, CommentEnd);
360 formTokenWithChars(T, TokenPtr, tok::newline);
361
362 if (CommentState == LCS_InsideCComment)
363 skipLineStartingDecorations();
364 return;
365
366 default: {
367 while (true) {
368 TokenPtr++;
369 if (TokenPtr == CommentEnd)
370 break;
371 char C = *TokenPtr;
372 if(C == '\n' || C == '\r' ||
373 C == '\\' || C == '@' || C == '<')
374 break;
375 }
376 formTokenWithChars(T, TokenPtr, tok::text);
377 T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
378 return;
379 }
380 }
381 }
382}
383
384void Lexer::setupAndLexVerbatimBlock(Token &T,
385 const char *TextBegin,
386 char Marker, StringRef EndName) {
387 VerbatimBlockEndCommandName.clear();
388 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
389 VerbatimBlockEndCommandName.append(EndName);
390
391 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
392 T.setVerbatimBlockName(StringRef(TextBegin - (T.getLength() - 1),
393 T.getLength() - 1));
394
395 State = LS_VerbatimBlockFirstLine;
396}
397
398void Lexer::lexVerbatimBlockFirstLine(Token &T) {
399 assert(BufferPtr < CommentEnd);
400
401 // FIXME: It would be better to scan the text once, finding either the block
402 // end command or newline.
403 //
404 // Extract current line.
405 const char *Newline = findNewline(BufferPtr, CommentEnd);
406 StringRef Line(BufferPtr, Newline - BufferPtr);
407
408 // Look for end command in current line.
409 size_t Pos = Line.find(VerbatimBlockEndCommandName);
410 const char *NextLine;
411 if (Pos == StringRef::npos) {
412 // Current line is completely verbatim.
413 NextLine = skipNewline(Newline, CommentEnd);
414 } else if (Pos == 0) {
415 // Current line contains just an end command.
416 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
417 formTokenWithChars(T, End, tok::verbatim_block_end);
418 T.setVerbatimBlockName(StringRef(End - (T.getLength() - 1),
419 T.getLength() - 1));
420 State = LS_Normal;
421 return;
422 } else {
423 // There is some text, followed by end command. Extract text first.
424 NextLine = BufferPtr + Pos;
425 }
426
427 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
428 T.setVerbatimBlockText(StringRef(NextLine - T.getLength(), T.getLength()));
429
430 State = LS_VerbatimBlockBody;
431}
432
433void Lexer::lexVerbatimBlockBody(Token &T) {
434 assert(State == LS_VerbatimBlockBody);
435
436 if (CommentState == LCS_InsideCComment)
437 skipLineStartingDecorations();
438
439 lexVerbatimBlockFirstLine(T);
440}
441
442void Lexer::lexVerbatimLine(Token &T, const char *TextBegin) {
443 // Extract current line.
444 const char *Newline = findNewline(BufferPtr, CommentEnd);
445
446 const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
447 const StringRef Text(TextBegin, Newline - TextBegin);
448
449 formTokenWithChars(T, Newline, tok::verbatim_line);
450 T.setVerbatimLineName(Name);
451 T.setVerbatimLineText(Text);
452}
453
454void Lexer::setupAndLexHTMLOpenTag(Token &T) {
455 assert(BufferPtr[0] == '<' && isHTMLIdentifierCharacter(BufferPtr[1]));
456 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
457
458 formTokenWithChars(T, TagNameEnd, tok::html_tag_open);
459 T.setHTMLTagOpenName(StringRef(TagNameEnd - (T.getLength() - 1),
460 T.getLength() - 1));
461
462 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
463
464 if (BufferPtr != CommentEnd && *BufferPtr == '>') {
465 BufferPtr++;
466 return;
467 }
468
469 if (BufferPtr != CommentEnd && isHTMLIdentifierCharacter(*BufferPtr))
470 State = LS_HTMLOpenTag;
471}
472
473void Lexer::lexHTMLOpenTag(Token &T) {
474 assert(State == LS_HTMLOpenTag);
475
476 const char *TokenPtr = BufferPtr;
477 char C = *TokenPtr;
478 if (isHTMLIdentifierCharacter(C)) {
479 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
480 formTokenWithChars(T, TokenPtr, tok::html_ident);
481 T.setHTMLIdent(StringRef(TokenPtr - T.getLength(), T.getLength()));
482 } else {
483 switch (C) {
484 case '=':
485 TokenPtr++;
486 formTokenWithChars(T, TokenPtr, tok::html_equals);
487 break;
488 case '\"':
489 case '\'': {
490 const char *OpenQuote = TokenPtr;
491 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
492 const char *ClosingQuote = TokenPtr;
493 if (TokenPtr != CommentEnd) // Skip closing quote.
494 TokenPtr++;
495 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
496 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
497 ClosingQuote - (OpenQuote + 1)));
498 break;
499 }
500 case '>':
501 TokenPtr++;
502 formTokenWithChars(T, TokenPtr, tok::html_greater);
503 break;
504 }
505 }
506
507 // Now look ahead and return to normal state if we don't see any HTML tokens
508 // ahead.
509 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
510 if (BufferPtr == CommentEnd) {
511 State = LS_Normal;
512 return;
513 }
514
515 C = *BufferPtr;
516 if (!isHTMLIdentifierCharacter(C) &&
517 C != '=' && C != '\"' && C != '\'' && C != '>') {
518 State = LS_Normal;
519 return;
520 }
521}
522
523void Lexer::lexHTMLCloseTag(Token &T) {
524 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
525
526 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
527 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
528
529 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
530 if (End != CommentEnd && *End == '>')
531 End++;
532
533 formTokenWithChars(T, End, tok::html_tag_close);
534 T.setHTMLTagCloseName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
535}
536
537Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
538 const char *BufferStart, const char *BufferEnd):
539 BufferStart(BufferStart), BufferEnd(BufferEnd),
540 FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
541 CommentState(LCS_BeforeComment), State(LS_Normal) {
542}
543
544void Lexer::lex(Token &T) {
545again:
546 switch (CommentState) {
547 case LCS_BeforeComment:
548 if (BufferPtr == BufferEnd) {
549 formTokenWithChars(T, BufferPtr, tok::eof);
550 return;
551 }
552
553 assert(*BufferPtr == '/');
554 BufferPtr++; // Skip first slash.
555 switch(*BufferPtr) {
556 case '/': { // BCPL comment.
557 BufferPtr++; // Skip second slash.
558
559 if (BufferPtr != BufferEnd) {
560 // Skip Doxygen magic marker, if it is present.
561 // It might be missing because of a typo //< or /*<, or because we
562 // merged this non-Doxygen comment into a bunch of Doxygen comments
563 // around it: /** ... */ /* ... */ /** ... */
564 const char C = *BufferPtr;
565 if (C == '/' || C == '!')
566 BufferPtr++;
567 }
568
569 // Skip less-than symbol that marks trailing comments.
570 // Skip it even if the comment is not a Doxygen one, because //< and /*<
571 // are frequent typos.
572 if (BufferPtr != BufferEnd && *BufferPtr == '<')
573 BufferPtr++;
574
575 CommentState = LCS_InsideBCPLComment;
576 State = LS_Normal;
577 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
578 goto again;
579 }
580 case '*': { // C comment.
581 BufferPtr++; // Skip star.
582
583 // Skip Doxygen magic marker.
584 const char C = *BufferPtr;
585 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
586 BufferPtr++;
587
588 // Skip less-than symbol that marks trailing comments.
589 if (BufferPtr != BufferEnd && *BufferPtr == '<')
590 BufferPtr++;
591
592 CommentState = LCS_InsideCComment;
593 State = LS_Normal;
594 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
595 goto again;
596 }
597 default:
598 llvm_unreachable("second character of comment should be '/' or '*'");
599 }
600
601 case LCS_BetweenComments: {
602 // Consecutive comments are extracted only if there is only whitespace
603 // between them. So we can search for the start of the next comment.
604 const char *EndWhitespace = BufferPtr;
605 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
606 EndWhitespace++;
607
608 // Turn any whitespace between comments (and there is only whitespace
609 // between them) into a newline. We have two newlines between comments
610 // in total (first one was synthesized after a comment).
611 formTokenWithChars(T, EndWhitespace, tok::newline);
612
613 CommentState = LCS_BeforeComment;
614 break;
615 }
616
617 case LCS_InsideBCPLComment:
618 case LCS_InsideCComment:
619 if (BufferPtr != CommentEnd) {
620 lexCommentText(T);
621 break;
622 } else {
623 // Skip C comment closing sequence.
624 if (CommentState == LCS_InsideCComment) {
625 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
626 BufferPtr += 2;
627 assert(BufferPtr <= BufferEnd);
628
629 // Synthenize newline just after the C comment, regardless if there is
630 // actually a newline.
631 formTokenWithChars(T, BufferPtr, tok::newline);
632
633 CommentState = LCS_BetweenComments;
634 break;
635 } else {
636 // Don't synthesized a newline after BCPL comment.
637 CommentState = LCS_BetweenComments;
638 goto again;
639 }
640 }
641 }
642}
643
644StringRef Lexer::getSpelling(const Token &Tok,
645 const SourceManager &SourceMgr,
646 bool *Invalid) const {
647 SourceLocation Loc = Tok.getLocation();
648 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
649
650 bool InvalidTemp = false;
651 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
652 if (InvalidTemp) {
653 *Invalid = true;
654 return StringRef();
655 }
656
657 const char *Begin = File.data() + LocInfo.second;
658 return StringRef(Begin, Tok.getLength());
659}
660
661void Lexer::addVerbatimBlockCommand(StringRef BeginName, StringRef EndName) {
662 VerbatimBlockCommand VBC;
663 VBC.BeginName = BeginName;
664 VBC.EndName = EndName;
665 VerbatimBlockCommands.push_back(VBC);
666}
667
668void Lexer::addVerbatimLineCommand(StringRef Name) {
669 VerbatimLineCommand VLC;
670 VLC.Name = Name;
671 VerbatimLineCommands.push_back(VLC);
672}
673
674} // end namespace comments
675} // end namespace clang
676