// Recovered from a Gitiles blame view (blob 870db2be5f9720ee77517683392028252c066529).
#include "clang/AST/CommentLexer.h"
#include "clang/AST/CommentCommandTraits.h"
#include "clang/Basic/ConvertUTF.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
6
namespace clang {
namespace comments {
9
10void Token::dump(const Lexer &L, const SourceManager &SM) const {
11 llvm::errs() << "comments::Token Kind=" << Kind << " ";
12 Loc.dump(SM);
13 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
14}
15
namespace {
/// Returns true if \p C may appear in the name of a named HTML character
/// reference (an ASCII letter, either case).
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}
21
/// Returns true if \p C is an ASCII decimal digit, i.e. may appear in the
/// digits of a decimal HTML character reference.
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return C >= '0' && C <= '9';
}
25
/// Returns true if \p C is an ASCII hexadecimal digit (0-9, a-f, A-F), i.e.
/// may appear in the digits of a hexadecimal HTML character reference.
bool isHTMLHexCharacterReferenceCharacter(char C) {
  return (C >= '0' && C <= '9') ||
         (C >= 'a' && C <= 'f') ||
         (C >= 'A' && C <= 'F');
}
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +000031
32bool isHTMLTagName(StringRef Name) {
33 return llvm::StringSwitch<bool>(Name)
34 .Cases("em", "strong", true)
35 .Cases("tt", "i", "b", "big", "small", true)
36 .Cases("strike", "s", "u", "font", true)
37 .Case("a", true)
38 .Case("hr", true)
39 .Cases("div", "span", true)
40 .Cases("h1", "h2", "h3", true)
41 .Cases("h4", "h5", "h6", true)
42 .Case("code", true)
43 .Case("blockquote", true)
44 .Cases("sub", "sup", true)
45 .Case("img", true)
46 .Case("p", true)
47 .Case("br", true)
48 .Case("pre", true)
49 .Cases("ins", "del", true)
50 .Cases("ul", "ol", "li", true)
51 .Cases("dl", "dt", "dd", true)
52 .Cases("table", "caption", true)
53 .Cases("thead", "tfoot", "tbody", true)
54 .Cases("colgroup", "col", true)
55 .Cases("tr", "th", "td", true)
56 .Default(false);
57}
} // unnamed namespace
59
60StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
61 return llvm::StringSwitch<StringRef>(Name)
62 .Case("amp", "&")
63 .Case("lt", "<")
64 .Case("gt", ">")
65 .Case("quot", "\"")
66 .Case("apos", "\'")
67 .Default("");
68}
69
70StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
71 unsigned CodePoint = 0;
72 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
73 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
74 CodePoint *= 10;
75 CodePoint += Name[i] - '0';
76 }
77
78 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
79 char *ResolvedPtr = Resolved;
80 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
81 return StringRef(Resolved, ResolvedPtr - Resolved);
82 else
83 return StringRef();
84}
85
86StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
87 unsigned CodePoint = 0;
88 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
89 CodePoint *= 16;
90 const char C = Name[i];
91 assert(isHTMLHexCharacterReferenceCharacter(C));
92 if (C >= '0' && C <= '9')
93 CodePoint += Name[i] - '0';
94 else if (C >= 'a' && C <= 'f')
95 CodePoint += Name[i] - 'a' + 10;
96 else
97 CodePoint += Name[i] - 'A' + 10;
98 }
99
100 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
101 char *ResolvedPtr = Resolved;
102 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
103 return StringRef(Resolved, ResolvedPtr - Resolved);
104 else
105 return StringRef();
106}
107
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000108void Lexer::skipLineStartingDecorations() {
109 // This function should be called only for C comments
110 assert(CommentState == LCS_InsideCComment);
111
112 if (BufferPtr == CommentEnd)
113 return;
114
115 switch (*BufferPtr) {
116 case ' ':
117 case '\t':
118 case '\f':
119 case '\v': {
120 const char *NewBufferPtr = BufferPtr;
121 NewBufferPtr++;
122 if (NewBufferPtr == CommentEnd)
123 return;
124
125 char C = *NewBufferPtr;
126 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
127 NewBufferPtr++;
128 if (NewBufferPtr == CommentEnd)
129 return;
130 C = *NewBufferPtr;
131 }
132 if (C == '*')
133 BufferPtr = NewBufferPtr + 1;
134 break;
135 }
136 case '*':
137 BufferPtr++;
138 break;
139 }
140}
141
namespace {
/// Returns pointer to the first newline character ('\n' or '\r') in
/// [BufferPtr, BufferEnd), or BufferEnd if there is none.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    if (C == '\n' || C == '\r')
      return BufferPtr;
  }
  return BufferEnd;
}
152
/// Skip one newline sequence ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd (in which case it is returned
/// unchanged) or point at '\n' or '\r' (asserted).
///
/// \returns pointer to the first character after the newline sequence.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    BufferPtr++;
  else {
    assert(*BufferPtr == '\r');
    BufferPtr++;
    // Treat "\r\n" as a single newline.
    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
      BufferPtr++;
  }
  return BufferPtr;
}
167
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000168const char *skipNamedCharacterReference(const char *BufferPtr,
169 const char *BufferEnd) {
170 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
171 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
172 return BufferPtr;
173 }
174 return BufferEnd;
175}
176
177const char *skipDecimalCharacterReference(const char *BufferPtr,
178 const char *BufferEnd) {
179 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
180 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
181 return BufferPtr;
182 }
183 return BufferEnd;
184}
185
186const char *skipHexCharacterReference(const char *BufferPtr,
187 const char *BufferEnd) {
188 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
189 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
190 return BufferPtr;
191 }
192 return BufferEnd;
193}
194
/// Returns true if \p C may start an HTML identifier (an ASCII letter, either
/// case).
bool isHTMLIdentifierStartingCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}
199
/// Returns true if \p C may appear inside an HTML identifier (an ASCII letter
/// or decimal digit).
bool isHTMLIdentifierCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z') ||
         (C >= '0' && C <= '9');
}
205
206const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
207 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
208 if (!isHTMLIdentifierCharacter(*BufferPtr))
209 return BufferPtr;
210 }
211 return BufferEnd;
212}
213
/// Skip HTML string quoted in single or double quotes.  A quote character
/// that is immediately preceded by a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote (asserted).
///
/// Returns pointer to closing quote, or BufferEnd if the string is not
/// terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  BufferPtr++;
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    if (C == Quote && BufferPtr[-1] != '\\')
      return BufferPtr;
  }
  return BufferEnd;
}
231
/// Returns true if \p C is space, tab, form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  return C == ' ' || C == '\t' || C == '\f' || C == '\v';
}
235
/// Returns true if \p C is space, newline, carriage return, tab, form feed or
/// vertical tab.
bool isWhitespace(char C) {
  return C == ' ' || C == '\n' || C == '\r' ||
         C == '\t' || C == '\f' || C == '\v';
}
240
241const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
242 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
243 if (!isWhitespace(*BufferPtr))
244 return BufferPtr;
245 }
246 return BufferEnd;
247}
248
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000249bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
250 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
251}
252
/// Returns true if \p C may appear in a command name (an ASCII letter or
/// decimal digit).
bool isCommandNameCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z') ||
         (C >= '0' && C <= '9');
}
258
259const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
260 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
261 if (!isCommandNameCharacter(*BufferPtr))
262 return BufferPtr;
263 }
264 return BufferEnd;
265}
266
267/// Return the one past end pointer for BCPL comments.
268/// Handles newlines escaped with backslash or trigraph for backslahs.
269const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
270 const char *CurPtr = BufferPtr;
271 while (CurPtr != BufferEnd) {
272 char C = *CurPtr;
273 while (C != '\n' && C != '\r') {
274 CurPtr++;
275 if (CurPtr == BufferEnd)
276 return BufferEnd;
277 C = *CurPtr;
278 }
279 // We found a newline, check if it is escaped.
280 const char *EscapePtr = CurPtr - 1;
281 while(isHorizontalWhitespace(*EscapePtr))
282 EscapePtr--;
283
284 if (*EscapePtr == '\\' ||
285 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
286 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
287 // We found an escaped newline.
288 CurPtr = skipNewline(CurPtr, BufferEnd);
289 } else
290 return CurPtr; // Not an escaped newline.
291 }
292 return BufferEnd;
293}
294
295/// Return the one past end pointer for C comments.
296/// Very dumb, does not handle escaped newlines or trigraphs.
297const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
298 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
299 if (*BufferPtr == '*') {
300 assert(BufferPtr + 1 != BufferEnd);
301 if (*(BufferPtr + 1) == '/')
302 return BufferPtr;
303 }
304 }
305 llvm_unreachable("buffer end hit before '*/' was seen");
306}
} // unnamed namespace
308
309void Lexer::lexCommentText(Token &T) {
310 assert(CommentState == LCS_InsideBCPLComment ||
311 CommentState == LCS_InsideCComment);
312
313 switch (State) {
314 case LS_Normal:
315 break;
316 case LS_VerbatimBlockFirstLine:
317 lexVerbatimBlockFirstLine(T);
318 return;
319 case LS_VerbatimBlockBody:
320 lexVerbatimBlockBody(T);
321 return;
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000322 case LS_VerbatimLineText:
323 lexVerbatimLineText(T);
324 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000325 case LS_HTMLStartTag:
326 lexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000327 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000328 case LS_HTMLEndTag:
329 lexHTMLEndTag(T);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000330 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000331 }
332
333 assert(State == LS_Normal);
334
335 const char *TokenPtr = BufferPtr;
336 assert(TokenPtr < CommentEnd);
337 while (TokenPtr != CommentEnd) {
338 switch(*TokenPtr) {
339 case '\\':
340 case '@': {
341 TokenPtr++;
342 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000343 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000344 return;
345 }
346 char C = *TokenPtr;
347 switch (C) {
348 default:
349 break;
350
351 case '\\': case '@': case '&': case '$':
352 case '#': case '<': case '>': case '%':
353 case '\"': case '.': case ':':
354 // This is one of \\ \@ \& \$ etc escape sequences.
355 TokenPtr++;
356 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
357 // This is the \:: escape sequence.
358 TokenPtr++;
359 }
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000360 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000361 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000362 T.setText(UnescapedText);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000363 return;
364 }
365
366 // Don't make zero-length commands.
367 if (!isCommandNameCharacter(*TokenPtr)) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000368 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000369 return;
370 }
371
372 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
373 unsigned Length = TokenPtr - (BufferPtr + 1);
374
375 // Hardcoded support for lexing LaTeX formula commands
376 // \f$ \f[ \f] \f{ \f} as a single command.
377 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
378 C = *TokenPtr;
379 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
380 TokenPtr++;
381 Length++;
382 }
383 }
384
385 const StringRef CommandName(BufferPtr + 1, Length);
386 StringRef EndName;
387
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000388 if (Traits.isVerbatimBlockCommand(CommandName, EndName)) {
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000389 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
390 return;
391 }
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000392 if (Traits.isVerbatimLineCommand(CommandName)) {
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000393 setupAndLexVerbatimLine(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000394 return;
395 }
396 formTokenWithChars(T, TokenPtr, tok::command);
397 T.setCommandName(CommandName);
398 return;
399 }
400
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000401 case '&':
402 lexHTMLCharacterReference(T);
403 return;
404
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000405 case '<': {
406 TokenPtr++;
407 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000408 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000409 return;
410 }
411 const char C = *TokenPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000412 if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000413 setupAndLexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000414 else if (C == '/')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000415 setupAndLexHTMLEndTag(T);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000416 else
417 formTextToken(T, TokenPtr);
418
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000419 return;
420 }
421
422 case '\n':
423 case '\r':
424 TokenPtr = skipNewline(TokenPtr, CommentEnd);
425 formTokenWithChars(T, TokenPtr, tok::newline);
426
427 if (CommentState == LCS_InsideCComment)
428 skipLineStartingDecorations();
429 return;
430
431 default: {
432 while (true) {
433 TokenPtr++;
434 if (TokenPtr == CommentEnd)
435 break;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000436 const char C = *TokenPtr;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000437 if(C == '\n' || C == '\r' ||
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000438 C == '\\' || C == '@' || C == '&' || C == '<')
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000439 break;
440 }
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000441 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000442 return;
443 }
444 }
445 }
446}
447
448void Lexer::setupAndLexVerbatimBlock(Token &T,
449 const char *TextBegin,
450 char Marker, StringRef EndName) {
451 VerbatimBlockEndCommandName.clear();
452 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
453 VerbatimBlockEndCommandName.append(EndName);
454
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000455 StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000456 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000457 T.setVerbatimBlockName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000458
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000459 // If there is a newline following the verbatim opening command, skip the
460 // newline so that we don't create an tok::verbatim_block_line with empty
461 // text content.
462 if (BufferPtr != CommentEnd) {
463 const char C = *BufferPtr;
464 if (C == '\n' || C == '\r') {
465 BufferPtr = skipNewline(BufferPtr, CommentEnd);
466 State = LS_VerbatimBlockBody;
467 return;
468 }
469 }
470
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000471 State = LS_VerbatimBlockFirstLine;
472}
473
474void Lexer::lexVerbatimBlockFirstLine(Token &T) {
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000475again:
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000476 assert(BufferPtr < CommentEnd);
477
478 // FIXME: It would be better to scan the text once, finding either the block
479 // end command or newline.
480 //
481 // Extract current line.
482 const char *Newline = findNewline(BufferPtr, CommentEnd);
483 StringRef Line(BufferPtr, Newline - BufferPtr);
484
485 // Look for end command in current line.
486 size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000487 const char *TextEnd;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000488 const char *NextLine;
489 if (Pos == StringRef::npos) {
490 // Current line is completely verbatim.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000491 TextEnd = Newline;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000492 NextLine = skipNewline(Newline, CommentEnd);
493 } else if (Pos == 0) {
494 // Current line contains just an end command.
495 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000496 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000497 formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000498 T.setVerbatimBlockName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000499 State = LS_Normal;
500 return;
501 } else {
502 // There is some text, followed by end command. Extract text first.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000503 TextEnd = BufferPtr + Pos;
504 NextLine = TextEnd;
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000505 // If there is only whitespace before end command, skip whitespace.
506 if (isWhitespace(BufferPtr, TextEnd)) {
507 BufferPtr = TextEnd;
508 goto again;
509 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000510 }
511
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000512 StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000513 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000514 T.setVerbatimBlockText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000515
516 State = LS_VerbatimBlockBody;
517}
518
519void Lexer::lexVerbatimBlockBody(Token &T) {
520 assert(State == LS_VerbatimBlockBody);
521
522 if (CommentState == LCS_InsideCComment)
523 skipLineStartingDecorations();
524
525 lexVerbatimBlockFirstLine(T);
526}
527
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000528void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) {
529 const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
530 formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
531 T.setVerbatimLineName(Name);
532
533 State = LS_VerbatimLineText;
534}
535
536void Lexer::lexVerbatimLineText(Token &T) {
537 assert(State == LS_VerbatimLineText);
538
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000539 // Extract current line.
540 const char *Newline = findNewline(BufferPtr, CommentEnd);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000541 const StringRef Text(BufferPtr, Newline - BufferPtr);
542 formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000543 T.setVerbatimLineText(Text);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000544
545 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000546}
547
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000548void Lexer::lexHTMLCharacterReference(Token &T) {
549 const char *TokenPtr = BufferPtr;
550 assert(*TokenPtr == '&');
551 TokenPtr++;
552 if (TokenPtr == CommentEnd) {
553 formTextToken(T, TokenPtr);
554 return;
555 }
556 const char *NamePtr;
557 bool isNamed = false;
558 bool isDecimal = false;
559 char C = *TokenPtr;
560 if (isHTMLNamedCharacterReferenceCharacter(C)) {
561 NamePtr = TokenPtr;
562 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
563 isNamed = true;
564 } else if (C == '#') {
565 TokenPtr++;
566 if (TokenPtr == CommentEnd) {
567 formTextToken(T, TokenPtr);
568 return;
569 }
570 C = *TokenPtr;
571 if (isHTMLDecimalCharacterReferenceCharacter(C)) {
572 NamePtr = TokenPtr;
573 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
574 isDecimal = true;
575 } else if (C == 'x' || C == 'X') {
576 TokenPtr++;
577 NamePtr = TokenPtr;
578 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
579 } else {
580 formTextToken(T, TokenPtr);
581 return;
582 }
583 } else {
584 formTextToken(T, TokenPtr);
585 return;
586 }
587 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
588 *TokenPtr != ';') {
589 formTextToken(T, TokenPtr);
590 return;
591 }
592 StringRef Name(NamePtr, TokenPtr - NamePtr);
593 TokenPtr++; // Skip semicolon.
594 StringRef Resolved;
595 if (isNamed)
596 Resolved = resolveHTMLNamedCharacterReference(Name);
597 else if (isDecimal)
598 Resolved = resolveHTMLDecimalCharacterReference(Name);
599 else
600 Resolved = resolveHTMLHexCharacterReference(Name);
601
602 if (Resolved.empty()) {
603 formTextToken(T, TokenPtr);
604 return;
605 }
606 formTokenWithChars(T, TokenPtr, tok::text);
607 T.setText(Resolved);
608 return;
609}
610
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000611void Lexer::setupAndLexHTMLStartTag(Token &T) {
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000612 assert(BufferPtr[0] == '<' &&
613 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000614 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000615 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000616 if (!isHTMLTagName(Name)) {
617 formTextToken(T, TagNameEnd);
618 return;
619 }
620
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000621 formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
622 T.setHTMLTagStartName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000623
624 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
625
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000626 const char C = *BufferPtr;
627 if (BufferPtr != CommentEnd &&
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000628 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000629 State = LS_HTMLStartTag;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000630}
631
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000632void Lexer::lexHTMLStartTag(Token &T) {
633 assert(State == LS_HTMLStartTag);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000634
635 const char *TokenPtr = BufferPtr;
636 char C = *TokenPtr;
637 if (isHTMLIdentifierCharacter(C)) {
638 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000639 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000640 formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000641 T.setHTMLIdent(Ident);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000642 } else {
643 switch (C) {
644 case '=':
645 TokenPtr++;
646 formTokenWithChars(T, TokenPtr, tok::html_equals);
647 break;
648 case '\"':
649 case '\'': {
650 const char *OpenQuote = TokenPtr;
651 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
652 const char *ClosingQuote = TokenPtr;
653 if (TokenPtr != CommentEnd) // Skip closing quote.
654 TokenPtr++;
655 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
656 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
657 ClosingQuote - (OpenQuote + 1)));
658 break;
659 }
660 case '>':
661 TokenPtr++;
662 formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000663 State = LS_Normal;
664 return;
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000665 case '/':
666 TokenPtr++;
667 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
668 TokenPtr++;
669 formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000670 } else
671 formTextToken(T, TokenPtr);
672
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000673 State = LS_Normal;
674 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000675 }
676 }
677
678 // Now look ahead and return to normal state if we don't see any HTML tokens
679 // ahead.
680 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
681 if (BufferPtr == CommentEnd) {
682 State = LS_Normal;
683 return;
684 }
685
686 C = *BufferPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000687 if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000688 C != '=' && C != '\"' && C != '\'' && C != '>') {
689 State = LS_Normal;
690 return;
691 }
692}
693
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000694void Lexer::setupAndLexHTMLEndTag(Token &T) {
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000695 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
696
697 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
698 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000699 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
700 if (!isHTMLTagName(Name)) {
701 formTextToken(T, TagNameEnd);
702 return;
703 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000704
705 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000706
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000707 formTokenWithChars(T, End, tok::html_end_tag);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000708 T.setHTMLTagEndName(Name);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000709
710 if (BufferPtr != CommentEnd && *BufferPtr == '>')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000711 State = LS_HTMLEndTag;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000712}
713
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000714void Lexer::lexHTMLEndTag(Token &T) {
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000715 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
716
717 formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
718 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000719}
720
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000721Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000722 SourceLocation FileLoc, const CommentOptions &CommOpts,
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000723 const char *BufferStart, const char *BufferEnd):
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000724 Allocator(Allocator), Traits(Traits),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000725 BufferStart(BufferStart), BufferEnd(BufferEnd),
726 FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
727 CommentState(LCS_BeforeComment), State(LS_Normal) {
728}
729
730void Lexer::lex(Token &T) {
731again:
732 switch (CommentState) {
733 case LCS_BeforeComment:
734 if (BufferPtr == BufferEnd) {
735 formTokenWithChars(T, BufferPtr, tok::eof);
736 return;
737 }
738
739 assert(*BufferPtr == '/');
740 BufferPtr++; // Skip first slash.
741 switch(*BufferPtr) {
742 case '/': { // BCPL comment.
743 BufferPtr++; // Skip second slash.
744
745 if (BufferPtr != BufferEnd) {
746 // Skip Doxygen magic marker, if it is present.
747 // It might be missing because of a typo //< or /*<, or because we
748 // merged this non-Doxygen comment into a bunch of Doxygen comments
749 // around it: /** ... */ /* ... */ /** ... */
750 const char C = *BufferPtr;
751 if (C == '/' || C == '!')
752 BufferPtr++;
753 }
754
755 // Skip less-than symbol that marks trailing comments.
756 // Skip it even if the comment is not a Doxygen one, because //< and /*<
757 // are frequent typos.
758 if (BufferPtr != BufferEnd && *BufferPtr == '<')
759 BufferPtr++;
760
761 CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000762 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
763 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000764 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
765 goto again;
766 }
767 case '*': { // C comment.
768 BufferPtr++; // Skip star.
769
770 // Skip Doxygen magic marker.
771 const char C = *BufferPtr;
772 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
773 BufferPtr++;
774
775 // Skip less-than symbol that marks trailing comments.
776 if (BufferPtr != BufferEnd && *BufferPtr == '<')
777 BufferPtr++;
778
779 CommentState = LCS_InsideCComment;
780 State = LS_Normal;
781 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
782 goto again;
783 }
784 default:
785 llvm_unreachable("second character of comment should be '/' or '*'");
786 }
787
788 case LCS_BetweenComments: {
789 // Consecutive comments are extracted only if there is only whitespace
790 // between them. So we can search for the start of the next comment.
791 const char *EndWhitespace = BufferPtr;
792 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
793 EndWhitespace++;
794
795 // Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000796 // between them -- guaranteed by comment extraction) into a newline. We
797 // have two newlines between C comments in total (first one was synthesized
798 // after a comment).
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000799 formTokenWithChars(T, EndWhitespace, tok::newline);
800
801 CommentState = LCS_BeforeComment;
802 break;
803 }
804
805 case LCS_InsideBCPLComment:
806 case LCS_InsideCComment:
807 if (BufferPtr != CommentEnd) {
808 lexCommentText(T);
809 break;
810 } else {
811 // Skip C comment closing sequence.
812 if (CommentState == LCS_InsideCComment) {
813 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
814 BufferPtr += 2;
815 assert(BufferPtr <= BufferEnd);
816
817 // Synthenize newline just after the C comment, regardless if there is
818 // actually a newline.
819 formTokenWithChars(T, BufferPtr, tok::newline);
820
821 CommentState = LCS_BetweenComments;
822 break;
823 } else {
824 // Don't synthesized a newline after BCPL comment.
825 CommentState = LCS_BetweenComments;
826 goto again;
827 }
828 }
829 }
830}
831
832StringRef Lexer::getSpelling(const Token &Tok,
833 const SourceManager &SourceMgr,
834 bool *Invalid) const {
835 SourceLocation Loc = Tok.getLocation();
836 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
837
838 bool InvalidTemp = false;
839 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
840 if (InvalidTemp) {
841 *Invalid = true;
842 return StringRef();
843 }
844
845 const char *Begin = File.data() + LocInfo.second;
846 return StringRef(Begin, Tok.getLength());
847}
848
} // end namespace comments
} // end namespace clang
851