blob: dde484510ff180acc4c9bd485ddaa6278c2d7b75 [file] [log] [blame]
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00001#include "clang/AST/CommentLexer.h"
Dmitri Gribenko477a9f52012-07-27 20:37:06 +00002#include "clang/Basic/ConvertUTF.h"
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00003#include "llvm/ADT/StringSwitch.h"
4#include "llvm/Support/ErrorHandling.h"
5
6namespace clang {
7namespace comments {
8
9void Token::dump(const Lexer &L, const SourceManager &SM) const {
10 llvm::errs() << "comments::Token Kind=" << Kind << " ";
11 Loc.dump(SM);
12 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
13}
14
15bool Lexer::isVerbatimBlockCommand(StringRef BeginName,
16 StringRef &EndName) const {
17 const char *Result = llvm::StringSwitch<const char *>(BeginName)
18 .Case("code", "endcode")
19 .Case("verbatim", "endverbatim")
20 .Case("htmlonly", "endhtmlonly")
21 .Case("latexonly", "endlatexonly")
22 .Case("xmlonly", "endxmlonly")
23 .Case("manonly", "endmanonly")
24 .Case("rtfonly", "endrtfonly")
25
26 .Case("dot", "enddot")
27 .Case("msc", "endmsc")
28
29 .Case("f$", "f$") // Inline LaTeX formula
30 .Case("f[", "f]") // Displayed LaTeX formula
31 .Case("f{", "f}") // LaTeX environment
32
33 .Default(NULL);
34
35 if (Result) {
36 EndName = Result;
37 return true;
38 }
39
40 for (VerbatimBlockCommandVector::const_iterator
41 I = VerbatimBlockCommands.begin(),
42 E = VerbatimBlockCommands.end();
43 I != E; ++I)
44 if (I->BeginName == BeginName) {
45 EndName = I->EndName;
46 return true;
47 }
48
49 return false;
50}
51
52bool Lexer::isVerbatimLineCommand(StringRef Name) const {
53 bool Result = llvm::StringSwitch<bool>(Name)
54 .Case("fn", true)
55 .Case("var", true)
56 .Case("property", true)
57 .Case("typedef", true)
58
59 .Case("overload", true)
60
61 .Case("defgroup", true)
62 .Case("ingroup", true)
63 .Case("addtogroup", true)
64 .Case("weakgroup", true)
65 .Case("name", true)
66
67 .Case("section", true)
68 .Case("subsection", true)
69 .Case("subsubsection", true)
70 .Case("paragraph", true)
71
72 .Case("mainpage", true)
73 .Case("subpage", true)
74 .Case("ref", true)
75
76 .Default(false);
77
78 if (Result)
79 return true;
80
81 for (VerbatimLineCommandVector::const_iterator
82 I = VerbatimLineCommands.begin(),
83 E = VerbatimLineCommands.end();
84 I != E; ++I)
85 if (I->Name == Name)
86 return true;
87
88 return false;
89}
90
namespace {
/// Returns true if \p C is an ASCII letter (the only characters accepted in a
/// named character reference).
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  if (C >= 'a' && C <= 'z')
    return true;
  return C >= 'A' && C <= 'Z';
}

/// Returns true if \p C is an ASCII decimal digit.
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return !(C < '0' || C > '9');
}

/// Returns true if \p C is an ASCII hexadecimal digit (either case).
bool isHTMLHexCharacterReferenceCharacter(char C) {
  if (C >= '0' && C <= '9')
    return true;
  if (C >= 'a' && C <= 'f')
    return true;
  return C >= 'A' && C <= 'F';
}
} // unnamed namespace
107
108StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
109 return llvm::StringSwitch<StringRef>(Name)
110 .Case("amp", "&")
111 .Case("lt", "<")
112 .Case("gt", ">")
113 .Case("quot", "\"")
114 .Case("apos", "\'")
115 .Default("");
116}
117
118StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
119 unsigned CodePoint = 0;
120 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
121 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
122 CodePoint *= 10;
123 CodePoint += Name[i] - '0';
124 }
125
126 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
127 char *ResolvedPtr = Resolved;
128 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
129 return StringRef(Resolved, ResolvedPtr - Resolved);
130 else
131 return StringRef();
132}
133
134StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
135 unsigned CodePoint = 0;
136 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
137 CodePoint *= 16;
138 const char C = Name[i];
139 assert(isHTMLHexCharacterReferenceCharacter(C));
140 if (C >= '0' && C <= '9')
141 CodePoint += Name[i] - '0';
142 else if (C >= 'a' && C <= 'f')
143 CodePoint += Name[i] - 'a' + 10;
144 else
145 CodePoint += Name[i] - 'A' + 10;
146 }
147
148 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
149 char *ResolvedPtr = Resolved;
150 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
151 return StringRef(Resolved, ResolvedPtr - Resolved);
152 else
153 return StringRef();
154}
155
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000156void Lexer::skipLineStartingDecorations() {
157 // This function should be called only for C comments
158 assert(CommentState == LCS_InsideCComment);
159
160 if (BufferPtr == CommentEnd)
161 return;
162
163 switch (*BufferPtr) {
164 case ' ':
165 case '\t':
166 case '\f':
167 case '\v': {
168 const char *NewBufferPtr = BufferPtr;
169 NewBufferPtr++;
170 if (NewBufferPtr == CommentEnd)
171 return;
172
173 char C = *NewBufferPtr;
174 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
175 NewBufferPtr++;
176 if (NewBufferPtr == CommentEnd)
177 return;
178 C = *NewBufferPtr;
179 }
180 if (C == '*')
181 BufferPtr = NewBufferPtr + 1;
182 break;
183 }
184 case '*':
185 BufferPtr++;
186 break;
187 }
188}
189
190namespace {
/// Returns a pointer to the first '\n' or '\r' in [BufferPtr, BufferEnd), or
/// \p BufferEnd if the range contains neither.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd) {
    if (*Cur == '\n' || *Cur == '\r')
      break;
    ++Cur;
  }
  return Cur;
}
200
/// Skips one newline sequence ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd (nothing is skipped) or point
/// at '\n' or '\r'.
/// \returns pointer to the first character after the newline sequence.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // A '\n' directly after '\r' belongs to the same newline sequence.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
215
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000216const char *skipNamedCharacterReference(const char *BufferPtr,
217 const char *BufferEnd) {
218 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
219 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
220 return BufferPtr;
221 }
222 return BufferEnd;
223}
224
225const char *skipDecimalCharacterReference(const char *BufferPtr,
226 const char *BufferEnd) {
227 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
228 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
229 return BufferPtr;
230 }
231 return BufferEnd;
232}
233
234const char *skipHexCharacterReference(const char *BufferPtr,
235 const char *BufferEnd) {
236 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
237 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
238 return BufferPtr;
239 }
240 return BufferEnd;
241}
242
/// Returns true if \p C may start an HTML identifier (ASCII letters only).
bool isHTMLIdentifierStartingCharacter(char C) {
  const bool IsLower = C >= 'a' && C <= 'z';
  const bool IsUpper = C >= 'A' && C <= 'Z';
  return IsLower || IsUpper;
}
247
/// Returns true if \p C may appear inside an HTML identifier (ASCII letters
/// and decimal digits).
bool isHTMLIdentifierCharacter(char C) {
  const bool IsLower = C >= 'a' && C <= 'z';
  const bool IsUpper = C >= 'A' && C <= 'Z';
  const bool IsDigit = C >= '0' && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
253
254const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
255 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
256 if (!isHTMLIdentifierCharacter(*BufferPtr))
257 return BufferPtr;
258 }
259 return BufferEnd;
260}
261
/// Skips an HTML string quoted in single or double quotes.  A quote character
/// directly preceded by a backslash does not terminate the string.
///
/// \param BufferPtr points at the opening quote.
/// \returns pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    const bool IsEscaped = *(Cur - 1) == '\\';
    if (*Cur == Quote && !IsEscaped)
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
279
/// Returns true if \p C is a space, tab, form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
283
/// Returns true if \p C is a space, tab, form feed, vertical tab, '\n' or
/// '\r'.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
288
289const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
290 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
291 if (!isWhitespace(*BufferPtr))
292 return BufferPtr;
293 }
294 return BufferEnd;
295}
296
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000297bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
298 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
299}
300
/// Returns true if \p C may appear in a command name (ASCII letters and
/// decimal digits).
bool isCommandNameCharacter(char C) {
  if (C >= 'a' && C <= 'z')
    return true;
  if (C >= 'A' && C <= 'Z')
    return true;
  return C >= '0' && C <= '9';
}
306
307const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
308 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
309 if (!isCommandNameCharacter(*BufferPtr))
310 return BufferPtr;
311 }
312 return BufferEnd;
313}
314
315/// Return the one past end pointer for BCPL comments.
316/// Handles newlines escaped with backslash or trigraph for backslahs.
317const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
318 const char *CurPtr = BufferPtr;
319 while (CurPtr != BufferEnd) {
320 char C = *CurPtr;
321 while (C != '\n' && C != '\r') {
322 CurPtr++;
323 if (CurPtr == BufferEnd)
324 return BufferEnd;
325 C = *CurPtr;
326 }
327 // We found a newline, check if it is escaped.
328 const char *EscapePtr = CurPtr - 1;
329 while(isHorizontalWhitespace(*EscapePtr))
330 EscapePtr--;
331
332 if (*EscapePtr == '\\' ||
333 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
334 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
335 // We found an escaped newline.
336 CurPtr = skipNewline(CurPtr, BufferEnd);
337 } else
338 return CurPtr; // Not an escaped newline.
339 }
340 return BufferEnd;
341}
342
343/// Return the one past end pointer for C comments.
344/// Very dumb, does not handle escaped newlines or trigraphs.
345const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
346 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
347 if (*BufferPtr == '*') {
348 assert(BufferPtr + 1 != BufferEnd);
349 if (*(BufferPtr + 1) == '/')
350 return BufferPtr;
351 }
352 }
353 llvm_unreachable("buffer end hit before '*/' was seen");
354}
355} // unnamed namespace
356
357void Lexer::lexCommentText(Token &T) {
358 assert(CommentState == LCS_InsideBCPLComment ||
359 CommentState == LCS_InsideCComment);
360
361 switch (State) {
362 case LS_Normal:
363 break;
364 case LS_VerbatimBlockFirstLine:
365 lexVerbatimBlockFirstLine(T);
366 return;
367 case LS_VerbatimBlockBody:
368 lexVerbatimBlockBody(T);
369 return;
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000370 case LS_VerbatimLineText:
371 lexVerbatimLineText(T);
372 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000373 case LS_HTMLStartTag:
374 lexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000375 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000376 case LS_HTMLEndTag:
377 lexHTMLEndTag(T);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000378 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000379 }
380
381 assert(State == LS_Normal);
382
383 const char *TokenPtr = BufferPtr;
384 assert(TokenPtr < CommentEnd);
385 while (TokenPtr != CommentEnd) {
386 switch(*TokenPtr) {
387 case '\\':
388 case '@': {
389 TokenPtr++;
390 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000391 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000392 return;
393 }
394 char C = *TokenPtr;
395 switch (C) {
396 default:
397 break;
398
399 case '\\': case '@': case '&': case '$':
400 case '#': case '<': case '>': case '%':
401 case '\"': case '.': case ':':
402 // This is one of \\ \@ \& \$ etc escape sequences.
403 TokenPtr++;
404 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
405 // This is the \:: escape sequence.
406 TokenPtr++;
407 }
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000408 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000409 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000410 T.setText(UnescapedText);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000411 return;
412 }
413
414 // Don't make zero-length commands.
415 if (!isCommandNameCharacter(*TokenPtr)) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000416 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000417 return;
418 }
419
420 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
421 unsigned Length = TokenPtr - (BufferPtr + 1);
422
423 // Hardcoded support for lexing LaTeX formula commands
424 // \f$ \f[ \f] \f{ \f} as a single command.
425 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
426 C = *TokenPtr;
427 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
428 TokenPtr++;
429 Length++;
430 }
431 }
432
433 const StringRef CommandName(BufferPtr + 1, Length);
434 StringRef EndName;
435
436 if (isVerbatimBlockCommand(CommandName, EndName)) {
437 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
438 return;
439 }
440 if (isVerbatimLineCommand(CommandName)) {
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000441 setupAndLexVerbatimLine(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000442 return;
443 }
444 formTokenWithChars(T, TokenPtr, tok::command);
445 T.setCommandName(CommandName);
446 return;
447 }
448
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000449 case '&':
450 lexHTMLCharacterReference(T);
451 return;
452
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000453 case '<': {
454 TokenPtr++;
455 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000456 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000457 return;
458 }
459 const char C = *TokenPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000460 if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000461 setupAndLexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000462 else if (C == '/')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000463 setupAndLexHTMLEndTag(T);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000464 else
465 formTextToken(T, TokenPtr);
466
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000467 return;
468 }
469
470 case '\n':
471 case '\r':
472 TokenPtr = skipNewline(TokenPtr, CommentEnd);
473 formTokenWithChars(T, TokenPtr, tok::newline);
474
475 if (CommentState == LCS_InsideCComment)
476 skipLineStartingDecorations();
477 return;
478
479 default: {
480 while (true) {
481 TokenPtr++;
482 if (TokenPtr == CommentEnd)
483 break;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000484 const char C = *TokenPtr;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000485 if(C == '\n' || C == '\r' ||
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000486 C == '\\' || C == '@' || C == '&' || C == '<')
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000487 break;
488 }
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000489 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000490 return;
491 }
492 }
493 }
494}
495
496void Lexer::setupAndLexVerbatimBlock(Token &T,
497 const char *TextBegin,
498 char Marker, StringRef EndName) {
499 VerbatimBlockEndCommandName.clear();
500 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
501 VerbatimBlockEndCommandName.append(EndName);
502
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000503 StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000504 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000505 T.setVerbatimBlockName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000506
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000507 // If there is a newline following the verbatim opening command, skip the
508 // newline so that we don't create an tok::verbatim_block_line with empty
509 // text content.
510 if (BufferPtr != CommentEnd) {
511 const char C = *BufferPtr;
512 if (C == '\n' || C == '\r') {
513 BufferPtr = skipNewline(BufferPtr, CommentEnd);
514 State = LS_VerbatimBlockBody;
515 return;
516 }
517 }
518
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000519 State = LS_VerbatimBlockFirstLine;
520}
521
522void Lexer::lexVerbatimBlockFirstLine(Token &T) {
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000523again:
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000524 assert(BufferPtr < CommentEnd);
525
526 // FIXME: It would be better to scan the text once, finding either the block
527 // end command or newline.
528 //
529 // Extract current line.
530 const char *Newline = findNewline(BufferPtr, CommentEnd);
531 StringRef Line(BufferPtr, Newline - BufferPtr);
532
533 // Look for end command in current line.
534 size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000535 const char *TextEnd;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000536 const char *NextLine;
537 if (Pos == StringRef::npos) {
538 // Current line is completely verbatim.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000539 TextEnd = Newline;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000540 NextLine = skipNewline(Newline, CommentEnd);
541 } else if (Pos == 0) {
542 // Current line contains just an end command.
543 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000544 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000545 formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000546 T.setVerbatimBlockName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000547 State = LS_Normal;
548 return;
549 } else {
550 // There is some text, followed by end command. Extract text first.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000551 TextEnd = BufferPtr + Pos;
552 NextLine = TextEnd;
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000553 // If there is only whitespace before end command, skip whitespace.
554 if (isWhitespace(BufferPtr, TextEnd)) {
555 BufferPtr = TextEnd;
556 goto again;
557 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000558 }
559
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000560 StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000561 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000562 T.setVerbatimBlockText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000563
564 State = LS_VerbatimBlockBody;
565}
566
567void Lexer::lexVerbatimBlockBody(Token &T) {
568 assert(State == LS_VerbatimBlockBody);
569
570 if (CommentState == LCS_InsideCComment)
571 skipLineStartingDecorations();
572
573 lexVerbatimBlockFirstLine(T);
574}
575
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000576void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) {
577 const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
578 formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
579 T.setVerbatimLineName(Name);
580
581 State = LS_VerbatimLineText;
582}
583
584void Lexer::lexVerbatimLineText(Token &T) {
585 assert(State == LS_VerbatimLineText);
586
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000587 // Extract current line.
588 const char *Newline = findNewline(BufferPtr, CommentEnd);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000589 const StringRef Text(BufferPtr, Newline - BufferPtr);
590 formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000591 T.setVerbatimLineText(Text);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000592
593 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000594}
595
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000596void Lexer::lexHTMLCharacterReference(Token &T) {
597 const char *TokenPtr = BufferPtr;
598 assert(*TokenPtr == '&');
599 TokenPtr++;
600 if (TokenPtr == CommentEnd) {
601 formTextToken(T, TokenPtr);
602 return;
603 }
604 const char *NamePtr;
605 bool isNamed = false;
606 bool isDecimal = false;
607 char C = *TokenPtr;
608 if (isHTMLNamedCharacterReferenceCharacter(C)) {
609 NamePtr = TokenPtr;
610 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
611 isNamed = true;
612 } else if (C == '#') {
613 TokenPtr++;
614 if (TokenPtr == CommentEnd) {
615 formTextToken(T, TokenPtr);
616 return;
617 }
618 C = *TokenPtr;
619 if (isHTMLDecimalCharacterReferenceCharacter(C)) {
620 NamePtr = TokenPtr;
621 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
622 isDecimal = true;
623 } else if (C == 'x' || C == 'X') {
624 TokenPtr++;
625 NamePtr = TokenPtr;
626 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
627 } else {
628 formTextToken(T, TokenPtr);
629 return;
630 }
631 } else {
632 formTextToken(T, TokenPtr);
633 return;
634 }
635 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
636 *TokenPtr != ';') {
637 formTextToken(T, TokenPtr);
638 return;
639 }
640 StringRef Name(NamePtr, TokenPtr - NamePtr);
641 TokenPtr++; // Skip semicolon.
642 StringRef Resolved;
643 if (isNamed)
644 Resolved = resolveHTMLNamedCharacterReference(Name);
645 else if (isDecimal)
646 Resolved = resolveHTMLDecimalCharacterReference(Name);
647 else
648 Resolved = resolveHTMLHexCharacterReference(Name);
649
650 if (Resolved.empty()) {
651 formTextToken(T, TokenPtr);
652 return;
653 }
654 formTokenWithChars(T, TokenPtr, tok::text);
655 T.setText(Resolved);
656 return;
657}
658
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000659void Lexer::setupAndLexHTMLStartTag(Token &T) {
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000660 assert(BufferPtr[0] == '<' &&
661 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000662 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
663
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000664 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000665 formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
666 T.setHTMLTagStartName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000667
668 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
669
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000670 const char C = *BufferPtr;
671 if (BufferPtr != CommentEnd &&
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000672 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000673 State = LS_HTMLStartTag;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000674}
675
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000676void Lexer::lexHTMLStartTag(Token &T) {
677 assert(State == LS_HTMLStartTag);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000678
679 const char *TokenPtr = BufferPtr;
680 char C = *TokenPtr;
681 if (isHTMLIdentifierCharacter(C)) {
682 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000683 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000684 formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000685 T.setHTMLIdent(Ident);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000686 } else {
687 switch (C) {
688 case '=':
689 TokenPtr++;
690 formTokenWithChars(T, TokenPtr, tok::html_equals);
691 break;
692 case '\"':
693 case '\'': {
694 const char *OpenQuote = TokenPtr;
695 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
696 const char *ClosingQuote = TokenPtr;
697 if (TokenPtr != CommentEnd) // Skip closing quote.
698 TokenPtr++;
699 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
700 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
701 ClosingQuote - (OpenQuote + 1)));
702 break;
703 }
704 case '>':
705 TokenPtr++;
706 formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000707 State = LS_Normal;
708 return;
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000709 case '/':
710 TokenPtr++;
711 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
712 TokenPtr++;
713 formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000714 } else
715 formTextToken(T, TokenPtr);
716
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000717 State = LS_Normal;
718 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000719 }
720 }
721
722 // Now look ahead and return to normal state if we don't see any HTML tokens
723 // ahead.
724 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
725 if (BufferPtr == CommentEnd) {
726 State = LS_Normal;
727 return;
728 }
729
730 C = *BufferPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000731 if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000732 C != '=' && C != '\"' && C != '\'' && C != '>') {
733 State = LS_Normal;
734 return;
735 }
736}
737
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000738void Lexer::setupAndLexHTMLEndTag(Token &T) {
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000739 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
740
741 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
742 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
743
744 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000745
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000746 formTokenWithChars(T, End, tok::html_end_tag);
747 T.setHTMLTagEndName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000748
749 if (BufferPtr != CommentEnd && *BufferPtr == '>')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000750 State = LS_HTMLEndTag;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000751}
752
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000753void Lexer::lexHTMLEndTag(Token &T) {
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000754 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
755
756 formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
757 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000758}
759
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000760Lexer::Lexer(llvm::BumpPtrAllocator &Allocator,
761 SourceLocation FileLoc, const CommentOptions &CommOpts,
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000762 const char *BufferStart, const char *BufferEnd):
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000763 Allocator(Allocator),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000764 BufferStart(BufferStart), BufferEnd(BufferEnd),
765 FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
766 CommentState(LCS_BeforeComment), State(LS_Normal) {
767}
768
769void Lexer::lex(Token &T) {
770again:
771 switch (CommentState) {
772 case LCS_BeforeComment:
773 if (BufferPtr == BufferEnd) {
774 formTokenWithChars(T, BufferPtr, tok::eof);
775 return;
776 }
777
778 assert(*BufferPtr == '/');
779 BufferPtr++; // Skip first slash.
780 switch(*BufferPtr) {
781 case '/': { // BCPL comment.
782 BufferPtr++; // Skip second slash.
783
784 if (BufferPtr != BufferEnd) {
785 // Skip Doxygen magic marker, if it is present.
786 // It might be missing because of a typo //< or /*<, or because we
787 // merged this non-Doxygen comment into a bunch of Doxygen comments
788 // around it: /** ... */ /* ... */ /** ... */
789 const char C = *BufferPtr;
790 if (C == '/' || C == '!')
791 BufferPtr++;
792 }
793
794 // Skip less-than symbol that marks trailing comments.
795 // Skip it even if the comment is not a Doxygen one, because //< and /*<
796 // are frequent typos.
797 if (BufferPtr != BufferEnd && *BufferPtr == '<')
798 BufferPtr++;
799
800 CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000801 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
802 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000803 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
804 goto again;
805 }
806 case '*': { // C comment.
807 BufferPtr++; // Skip star.
808
809 // Skip Doxygen magic marker.
810 const char C = *BufferPtr;
811 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
812 BufferPtr++;
813
814 // Skip less-than symbol that marks trailing comments.
815 if (BufferPtr != BufferEnd && *BufferPtr == '<')
816 BufferPtr++;
817
818 CommentState = LCS_InsideCComment;
819 State = LS_Normal;
820 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
821 goto again;
822 }
823 default:
824 llvm_unreachable("second character of comment should be '/' or '*'");
825 }
826
827 case LCS_BetweenComments: {
828 // Consecutive comments are extracted only if there is only whitespace
829 // between them. So we can search for the start of the next comment.
830 const char *EndWhitespace = BufferPtr;
831 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
832 EndWhitespace++;
833
834 // Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000835 // between them -- guaranteed by comment extraction) into a newline. We
836 // have two newlines between C comments in total (first one was synthesized
837 // after a comment).
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000838 formTokenWithChars(T, EndWhitespace, tok::newline);
839
840 CommentState = LCS_BeforeComment;
841 break;
842 }
843
844 case LCS_InsideBCPLComment:
845 case LCS_InsideCComment:
846 if (BufferPtr != CommentEnd) {
847 lexCommentText(T);
848 break;
849 } else {
850 // Skip C comment closing sequence.
851 if (CommentState == LCS_InsideCComment) {
852 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
853 BufferPtr += 2;
854 assert(BufferPtr <= BufferEnd);
855
856 // Synthenize newline just after the C comment, regardless if there is
857 // actually a newline.
858 formTokenWithChars(T, BufferPtr, tok::newline);
859
860 CommentState = LCS_BetweenComments;
861 break;
862 } else {
863 // Don't synthesized a newline after BCPL comment.
864 CommentState = LCS_BetweenComments;
865 goto again;
866 }
867 }
868 }
869}
870
871StringRef Lexer::getSpelling(const Token &Tok,
872 const SourceManager &SourceMgr,
873 bool *Invalid) const {
874 SourceLocation Loc = Tok.getLocation();
875 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
876
877 bool InvalidTemp = false;
878 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
879 if (InvalidTemp) {
880 *Invalid = true;
881 return StringRef();
882 }
883
884 const char *Begin = File.data() + LocInfo.second;
885 return StringRef(Begin, Tok.getLength());
886}
887
888void Lexer::addVerbatimBlockCommand(StringRef BeginName, StringRef EndName) {
889 VerbatimBlockCommand VBC;
890 VBC.BeginName = BeginName;
891 VBC.EndName = EndName;
892 VerbatimBlockCommands.push_back(VBC);
893}
894
895void Lexer::addVerbatimLineCommand(StringRef Name) {
896 VerbatimLineCommand VLC;
897 VLC.Name = Name;
898 VerbatimLineCommands.push_back(VLC);
899}
900
901} // end namespace comments
902} // end namespace clang
903