blob: 1f4955d1cf2646905853dc85b7c823be975ac88c [file] [log] [blame]
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00001#include "clang/AST/CommentLexer.h"
2#include "llvm/ADT/StringSwitch.h"
3#include "llvm/Support/ErrorHandling.h"
4
5namespace clang {
6namespace comments {
7
8void Token::dump(const Lexer &L, const SourceManager &SM) const {
9 llvm::errs() << "comments::Token Kind=" << Kind << " ";
10 Loc.dump(SM);
11 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
12}
13
14bool Lexer::isVerbatimBlockCommand(StringRef BeginName,
15 StringRef &EndName) const {
16 const char *Result = llvm::StringSwitch<const char *>(BeginName)
17 .Case("code", "endcode")
18 .Case("verbatim", "endverbatim")
19 .Case("htmlonly", "endhtmlonly")
20 .Case("latexonly", "endlatexonly")
21 .Case("xmlonly", "endxmlonly")
22 .Case("manonly", "endmanonly")
23 .Case("rtfonly", "endrtfonly")
24
25 .Case("dot", "enddot")
26 .Case("msc", "endmsc")
27
28 .Case("f$", "f$") // Inline LaTeX formula
29 .Case("f[", "f]") // Displayed LaTeX formula
30 .Case("f{", "f}") // LaTeX environment
31
32 .Default(NULL);
33
34 if (Result) {
35 EndName = Result;
36 return true;
37 }
38
39 for (VerbatimBlockCommandVector::const_iterator
40 I = VerbatimBlockCommands.begin(),
41 E = VerbatimBlockCommands.end();
42 I != E; ++I)
43 if (I->BeginName == BeginName) {
44 EndName = I->EndName;
45 return true;
46 }
47
48 return false;
49}
50
51bool Lexer::isVerbatimLineCommand(StringRef Name) const {
52 bool Result = llvm::StringSwitch<bool>(Name)
53 .Case("fn", true)
54 .Case("var", true)
55 .Case("property", true)
56 .Case("typedef", true)
57
58 .Case("overload", true)
59
60 .Case("defgroup", true)
61 .Case("ingroup", true)
62 .Case("addtogroup", true)
63 .Case("weakgroup", true)
64 .Case("name", true)
65
66 .Case("section", true)
67 .Case("subsection", true)
68 .Case("subsubsection", true)
69 .Case("paragraph", true)
70
71 .Case("mainpage", true)
72 .Case("subpage", true)
73 .Case("ref", true)
74
75 .Default(false);
76
77 if (Result)
78 return true;
79
80 for (VerbatimLineCommandVector::const_iterator
81 I = VerbatimLineCommands.begin(),
82 E = VerbatimLineCommands.end();
83 I != E; ++I)
84 if (I->Name == Name)
85 return true;
86
87 return false;
88}
89
90void Lexer::skipLineStartingDecorations() {
91 // This function should be called only for C comments
92 assert(CommentState == LCS_InsideCComment);
93
94 if (BufferPtr == CommentEnd)
95 return;
96
97 switch (*BufferPtr) {
98 case ' ':
99 case '\t':
100 case '\f':
101 case '\v': {
102 const char *NewBufferPtr = BufferPtr;
103 NewBufferPtr++;
104 if (NewBufferPtr == CommentEnd)
105 return;
106
107 char C = *NewBufferPtr;
108 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
109 NewBufferPtr++;
110 if (NewBufferPtr == CommentEnd)
111 return;
112 C = *NewBufferPtr;
113 }
114 if (C == '*')
115 BufferPtr = NewBufferPtr + 1;
116 break;
117 }
118 case '*':
119 BufferPtr++;
120 break;
121 }
122}
123
124namespace {
/// Returns pointer to the first newline character ('\n' or '\r') in the range
/// [BufferPtr, BufferEnd), or \p BufferEnd if the range contains none.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    if (C == '\n' || C == '\r')
      return BufferPtr;
  }
  return BufferEnd;
}
134
/// Skip one line terminator ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd (nothing is skipped) or point
/// at '\n' or '\r'.
///
/// \returns pointer just past the line terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    BufferPtr++;
  else {
    assert(*BufferPtr == '\r');
    BufferPtr++;
    // Treat "\r\n" as a single line terminator.
    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
      BufferPtr++;
  }
  return BufferPtr;
}
149
/// Returns true if \p C may start an HTML identifier, i.e. it is an ASCII
/// letter.
bool isHTMLIdentifierStartingCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}
154
/// Returns true if \p C may appear inside an HTML identifier, i.e. it is an
/// ASCII letter or digit.
bool isHTMLIdentifierCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z') ||
         (C >= '0' && C <= '9');
}
160
161const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
162 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
163 if (!isHTMLIdentifierCharacter(*BufferPtr))
164 return BufferPtr;
165 }
166 return BufferEnd;
167}
168
/// Skip HTML string quoted in single or double quotes.  A quote character
/// that is immediately preceded by a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote.
///
/// \returns pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  BufferPtr++;
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    // BufferPtr[-1] is always valid here: we are past the opening quote.
    if (C == Quote && BufferPtr[-1] != '\\')
      return BufferPtr;
  }
  return BufferEnd;
}
186
/// Returns true if \p C is a space, tab, form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  return C == ' ' || C == '\t' || C == '\f' || C == '\v';
}
190
/// Returns true if \p C is a space, newline, carriage return, tab, form feed
/// or vertical tab.
bool isWhitespace(char C) {
  return C == ' ' || C == '\n' || C == '\r' ||
         C == '\t' || C == '\f' || C == '\v';
}
195
196const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
197 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
198 if (!isWhitespace(*BufferPtr))
199 return BufferPtr;
200 }
201 return BufferEnd;
202}
203
/// Returns true if \p C may appear in a command name, i.e. it is an ASCII
/// letter or digit.
bool isCommandNameCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z') ||
         (C >= '0' && C <= '9');
}
209
210const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
211 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
212 if (!isCommandNameCharacter(*BufferPtr))
213 return BufferPtr;
214 }
215 return BufferEnd;
216}
217
218/// Return the one past end pointer for BCPL comments.
219/// Handles newlines escaped with backslash or trigraph for backslahs.
220const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
221 const char *CurPtr = BufferPtr;
222 while (CurPtr != BufferEnd) {
223 char C = *CurPtr;
224 while (C != '\n' && C != '\r') {
225 CurPtr++;
226 if (CurPtr == BufferEnd)
227 return BufferEnd;
228 C = *CurPtr;
229 }
230 // We found a newline, check if it is escaped.
231 const char *EscapePtr = CurPtr - 1;
232 while(isHorizontalWhitespace(*EscapePtr))
233 EscapePtr--;
234
235 if (*EscapePtr == '\\' ||
236 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
237 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
238 // We found an escaped newline.
239 CurPtr = skipNewline(CurPtr, BufferEnd);
240 } else
241 return CurPtr; // Not an escaped newline.
242 }
243 return BufferEnd;
244}
245
246/// Return the one past end pointer for C comments.
247/// Very dumb, does not handle escaped newlines or trigraphs.
248const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
249 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
250 if (*BufferPtr == '*') {
251 assert(BufferPtr + 1 != BufferEnd);
252 if (*(BufferPtr + 1) == '/')
253 return BufferPtr;
254 }
255 }
256 llvm_unreachable("buffer end hit before '*/' was seen");
257}
258} // unnamed namespace
259
260void Lexer::lexCommentText(Token &T) {
261 assert(CommentState == LCS_InsideBCPLComment ||
262 CommentState == LCS_InsideCComment);
263
264 switch (State) {
265 case LS_Normal:
266 break;
267 case LS_VerbatimBlockFirstLine:
268 lexVerbatimBlockFirstLine(T);
269 return;
270 case LS_VerbatimBlockBody:
271 lexVerbatimBlockBody(T);
272 return;
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000273 case LS_VerbatimLineText:
274 lexVerbatimLineText(T);
275 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000276 case LS_HTMLOpenTag:
277 lexHTMLOpenTag(T);
278 return;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000279 case LS_HTMLCloseTag:
280 lexHTMLCloseTag(T);
281 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000282 }
283
284 assert(State == LS_Normal);
285
286 const char *TokenPtr = BufferPtr;
287 assert(TokenPtr < CommentEnd);
288 while (TokenPtr != CommentEnd) {
289 switch(*TokenPtr) {
290 case '\\':
291 case '@': {
292 TokenPtr++;
293 if (TokenPtr == CommentEnd) {
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000294 StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000295 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000296 T.setText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000297 return;
298 }
299 char C = *TokenPtr;
300 switch (C) {
301 default:
302 break;
303
304 case '\\': case '@': case '&': case '$':
305 case '#': case '<': case '>': case '%':
306 case '\"': case '.': case ':':
307 // This is one of \\ \@ \& \$ etc escape sequences.
308 TokenPtr++;
309 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
310 // This is the \:: escape sequence.
311 TokenPtr++;
312 }
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000313 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000314 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000315 T.setText(UnescapedText);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000316 return;
317 }
318
319 // Don't make zero-length commands.
320 if (!isCommandNameCharacter(*TokenPtr)) {
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000321 StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000322 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000323 T.setText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000324 return;
325 }
326
327 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
328 unsigned Length = TokenPtr - (BufferPtr + 1);
329
330 // Hardcoded support for lexing LaTeX formula commands
331 // \f$ \f[ \f] \f{ \f} as a single command.
332 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
333 C = *TokenPtr;
334 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
335 TokenPtr++;
336 Length++;
337 }
338 }
339
340 const StringRef CommandName(BufferPtr + 1, Length);
341 StringRef EndName;
342
343 if (isVerbatimBlockCommand(CommandName, EndName)) {
344 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
345 return;
346 }
347 if (isVerbatimLineCommand(CommandName)) {
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000348 setupAndLexVerbatimLine(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000349 return;
350 }
351 formTokenWithChars(T, TokenPtr, tok::command);
352 T.setCommandName(CommandName);
353 return;
354 }
355
356 case '<': {
357 TokenPtr++;
358 if (TokenPtr == CommentEnd) {
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000359 StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000360 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000361 T.setText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000362 return;
363 }
364 const char C = *TokenPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000365 if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000366 setupAndLexHTMLOpenTag(T);
367 else if (C == '/')
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000368 setupAndLexHTMLCloseTag(T);
Dmitri Gribenko5676d322012-06-27 23:28:29 +0000369 else {
370 StringRef Text(BufferPtr, TokenPtr - BufferPtr);
371 formTokenWithChars(T, TokenPtr, tok::text);
372 T.setText(Text);
373 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000374 return;
375 }
376
377 case '\n':
378 case '\r':
379 TokenPtr = skipNewline(TokenPtr, CommentEnd);
380 formTokenWithChars(T, TokenPtr, tok::newline);
381
382 if (CommentState == LCS_InsideCComment)
383 skipLineStartingDecorations();
384 return;
385
386 default: {
387 while (true) {
388 TokenPtr++;
389 if (TokenPtr == CommentEnd)
390 break;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000391 const char C = *TokenPtr;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000392 if(C == '\n' || C == '\r' ||
393 C == '\\' || C == '@' || C == '<')
394 break;
395 }
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000396 StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000397 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000398 T.setText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000399 return;
400 }
401 }
402 }
403}
404
405void Lexer::setupAndLexVerbatimBlock(Token &T,
406 const char *TextBegin,
407 char Marker, StringRef EndName) {
408 VerbatimBlockEndCommandName.clear();
409 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
410 VerbatimBlockEndCommandName.append(EndName);
411
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000412 StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000413 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000414 T.setVerbatimBlockName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000415
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000416 // If there is a newline following the verbatim opening command, skip the
417 // newline so that we don't create an tok::verbatim_block_line with empty
418 // text content.
419 if (BufferPtr != CommentEnd) {
420 const char C = *BufferPtr;
421 if (C == '\n' || C == '\r') {
422 BufferPtr = skipNewline(BufferPtr, CommentEnd);
423 State = LS_VerbatimBlockBody;
424 return;
425 }
426 }
427
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000428 State = LS_VerbatimBlockFirstLine;
429}
430
431void Lexer::lexVerbatimBlockFirstLine(Token &T) {
432 assert(BufferPtr < CommentEnd);
433
434 // FIXME: It would be better to scan the text once, finding either the block
435 // end command or newline.
436 //
437 // Extract current line.
438 const char *Newline = findNewline(BufferPtr, CommentEnd);
439 StringRef Line(BufferPtr, Newline - BufferPtr);
440
441 // Look for end command in current line.
442 size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000443 const char *TextEnd;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000444 const char *NextLine;
445 if (Pos == StringRef::npos) {
446 // Current line is completely verbatim.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000447 TextEnd = Newline;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000448 NextLine = skipNewline(Newline, CommentEnd);
449 } else if (Pos == 0) {
450 // Current line contains just an end command.
451 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000452 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000453 formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000454 T.setVerbatimBlockName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000455 State = LS_Normal;
456 return;
457 } else {
458 // There is some text, followed by end command. Extract text first.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000459 TextEnd = BufferPtr + Pos;
460 NextLine = TextEnd;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000461 }
462
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000463 StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000464 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000465 T.setVerbatimBlockText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000466
467 State = LS_VerbatimBlockBody;
468}
469
470void Lexer::lexVerbatimBlockBody(Token &T) {
471 assert(State == LS_VerbatimBlockBody);
472
473 if (CommentState == LCS_InsideCComment)
474 skipLineStartingDecorations();
475
476 lexVerbatimBlockFirstLine(T);
477}
478
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000479void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) {
480 const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
481 formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
482 T.setVerbatimLineName(Name);
483
484 State = LS_VerbatimLineText;
485}
486
487void Lexer::lexVerbatimLineText(Token &T) {
488 assert(State == LS_VerbatimLineText);
489
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000490 // Extract current line.
491 const char *Newline = findNewline(BufferPtr, CommentEnd);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000492 const StringRef Text(BufferPtr, Newline - BufferPtr);
493 formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000494 T.setVerbatimLineText(Text);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000495
496 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000497}
498
499void Lexer::setupAndLexHTMLOpenTag(Token &T) {
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000500 assert(BufferPtr[0] == '<' &&
501 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000502 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
503
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000504 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000505 formTokenWithChars(T, TagNameEnd, tok::html_tag_open);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000506 T.setHTMLTagOpenName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000507
508 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
509
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000510 const char C = *BufferPtr;
511 if (BufferPtr != CommentEnd &&
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000512 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000513 State = LS_HTMLOpenTag;
514}
515
516void Lexer::lexHTMLOpenTag(Token &T) {
517 assert(State == LS_HTMLOpenTag);
518
519 const char *TokenPtr = BufferPtr;
520 char C = *TokenPtr;
521 if (isHTMLIdentifierCharacter(C)) {
522 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000523 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000524 formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000525 T.setHTMLIdent(Ident);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000526 } else {
527 switch (C) {
528 case '=':
529 TokenPtr++;
530 formTokenWithChars(T, TokenPtr, tok::html_equals);
531 break;
532 case '\"':
533 case '\'': {
534 const char *OpenQuote = TokenPtr;
535 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
536 const char *ClosingQuote = TokenPtr;
537 if (TokenPtr != CommentEnd) // Skip closing quote.
538 TokenPtr++;
539 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
540 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
541 ClosingQuote - (OpenQuote + 1)));
542 break;
543 }
544 case '>':
545 TokenPtr++;
546 formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000547 State = LS_Normal;
548 return;
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000549 case '/':
550 TokenPtr++;
551 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
552 TokenPtr++;
553 formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
554 } else {
555 StringRef Text(BufferPtr, TokenPtr - BufferPtr);
556 formTokenWithChars(T, TokenPtr, tok::text);
557 T.setText(Text);
558 }
559 State = LS_Normal;
560 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000561 }
562 }
563
564 // Now look ahead and return to normal state if we don't see any HTML tokens
565 // ahead.
566 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
567 if (BufferPtr == CommentEnd) {
568 State = LS_Normal;
569 return;
570 }
571
572 C = *BufferPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000573 if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000574 C != '=' && C != '\"' && C != '\'' && C != '>') {
575 State = LS_Normal;
576 return;
577 }
578}
579
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000580void Lexer::setupAndLexHTMLCloseTag(Token &T) {
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000581 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
582
583 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
584 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
585
586 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000587
588 formTokenWithChars(T, End, tok::html_tag_close);
589 T.setHTMLTagCloseName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000590
591 if (BufferPtr != CommentEnd && *BufferPtr == '>')
592 State = LS_HTMLCloseTag;
593}
594
595void Lexer::lexHTMLCloseTag(Token &T) {
596 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
597
598 formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
599 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000600}
601
602Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
603 const char *BufferStart, const char *BufferEnd):
604 BufferStart(BufferStart), BufferEnd(BufferEnd),
605 FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
606 CommentState(LCS_BeforeComment), State(LS_Normal) {
607}
608
609void Lexer::lex(Token &T) {
610again:
611 switch (CommentState) {
612 case LCS_BeforeComment:
613 if (BufferPtr == BufferEnd) {
614 formTokenWithChars(T, BufferPtr, tok::eof);
615 return;
616 }
617
618 assert(*BufferPtr == '/');
619 BufferPtr++; // Skip first slash.
620 switch(*BufferPtr) {
621 case '/': { // BCPL comment.
622 BufferPtr++; // Skip second slash.
623
624 if (BufferPtr != BufferEnd) {
625 // Skip Doxygen magic marker, if it is present.
626 // It might be missing because of a typo //< or /*<, or because we
627 // merged this non-Doxygen comment into a bunch of Doxygen comments
628 // around it: /** ... */ /* ... */ /** ... */
629 const char C = *BufferPtr;
630 if (C == '/' || C == '!')
631 BufferPtr++;
632 }
633
634 // Skip less-than symbol that marks trailing comments.
635 // Skip it even if the comment is not a Doxygen one, because //< and /*<
636 // are frequent typos.
637 if (BufferPtr != BufferEnd && *BufferPtr == '<')
638 BufferPtr++;
639
640 CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000641 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
642 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000643 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
644 goto again;
645 }
646 case '*': { // C comment.
647 BufferPtr++; // Skip star.
648
649 // Skip Doxygen magic marker.
650 const char C = *BufferPtr;
651 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
652 BufferPtr++;
653
654 // Skip less-than symbol that marks trailing comments.
655 if (BufferPtr != BufferEnd && *BufferPtr == '<')
656 BufferPtr++;
657
658 CommentState = LCS_InsideCComment;
659 State = LS_Normal;
660 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
661 goto again;
662 }
663 default:
664 llvm_unreachable("second character of comment should be '/' or '*'");
665 }
666
667 case LCS_BetweenComments: {
668 // Consecutive comments are extracted only if there is only whitespace
669 // between them. So we can search for the start of the next comment.
670 const char *EndWhitespace = BufferPtr;
671 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
672 EndWhitespace++;
673
674 // Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000675 // between them -- guaranteed by comment extraction) into a newline. We
676 // have two newlines between C comments in total (first one was synthesized
677 // after a comment).
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000678 formTokenWithChars(T, EndWhitespace, tok::newline);
679
680 CommentState = LCS_BeforeComment;
681 break;
682 }
683
684 case LCS_InsideBCPLComment:
685 case LCS_InsideCComment:
686 if (BufferPtr != CommentEnd) {
687 lexCommentText(T);
688 break;
689 } else {
690 // Skip C comment closing sequence.
691 if (CommentState == LCS_InsideCComment) {
692 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
693 BufferPtr += 2;
694 assert(BufferPtr <= BufferEnd);
695
696 // Synthenize newline just after the C comment, regardless if there is
697 // actually a newline.
698 formTokenWithChars(T, BufferPtr, tok::newline);
699
700 CommentState = LCS_BetweenComments;
701 break;
702 } else {
703 // Don't synthesized a newline after BCPL comment.
704 CommentState = LCS_BetweenComments;
705 goto again;
706 }
707 }
708 }
709}
710
711StringRef Lexer::getSpelling(const Token &Tok,
712 const SourceManager &SourceMgr,
713 bool *Invalid) const {
714 SourceLocation Loc = Tok.getLocation();
715 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
716
717 bool InvalidTemp = false;
718 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
719 if (InvalidTemp) {
720 *Invalid = true;
721 return StringRef();
722 }
723
724 const char *Begin = File.data() + LocInfo.second;
725 return StringRef(Begin, Tok.getLength());
726}
727
728void Lexer::addVerbatimBlockCommand(StringRef BeginName, StringRef EndName) {
729 VerbatimBlockCommand VBC;
730 VBC.BeginName = BeginName;
731 VBC.EndName = EndName;
732 VerbatimBlockCommands.push_back(VBC);
733}
734
735void Lexer::addVerbatimLineCommand(StringRef Name) {
736 VerbatimLineCommand VLC;
737 VLC.Name = Name;
738 VerbatimLineCommands.push_back(VLC);
739}
740
741} // end namespace comments
742} // end namespace clang
743