blob: 53440c169fb2871c14480fc57577c93b9f98adcb [file] [log] [blame]
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00001#include "clang/AST/CommentLexer.h"
Dmitri Gribenkoaa580812012-08-09 00:03:17 +00002#include "clang/AST/CommentCommandTraits.h"
Dmitri Gribenko477a9f52012-07-27 20:37:06 +00003#include "clang/Basic/ConvertUTF.h"
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00004#include "llvm/ADT/StringSwitch.h"
5#include "llvm/Support/ErrorHandling.h"
6
7namespace clang {
8namespace comments {
9
10void Token::dump(const Lexer &L, const SourceManager &SM) const {
11 llvm::errs() << "comments::Token Kind=" << Kind << " ";
12 Loc.dump(SM);
13 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
14}
15
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000016namespace {
/// Returns true if \p C may appear in the name of a named character
/// reference (e.g. the "amp" in "&amp;"): an ASCII letter of either case.
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  if (C >= 'a' && C <= 'z')
    return true;
  return C >= 'A' && C <= 'Z';
}
21
/// Returns true if \p C is an ASCII decimal digit, i.e. a character that may
/// appear in the numeric part of a decimal character reference ("&#65;").
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  const bool BelowRange = C < '0';
  const bool AboveRange = C > '9';
  return !(BelowRange || AboveRange);
}
25
/// Returns true if \p C is an ASCII hexadecimal digit (0-9, a-f or A-F), i.e.
/// a character that may appear in the numeric part of a hexadecimal character
/// reference ("&#x41;").
bool isHTMLHexCharacterReferenceCharacter(char C) {
  const bool IsDigit = C >= '0' && C <= '9';
  const bool IsLowerHex = C >= 'a' && C <= 'f';
  const bool IsUpperHex = C >= 'A' && C <= 'F';
  return IsDigit || IsLowerHex || IsUpperHex;
}
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +000031
Dmitri Gribenkoc24a76e2012-08-31 02:21:44 +000032#include "clang/AST/CommentHTMLTags.inc"
33
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000034} // unnamed namespace
35
36StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
37 return llvm::StringSwitch<StringRef>(Name)
38 .Case("amp", "&")
39 .Case("lt", "<")
40 .Case("gt", ">")
41 .Case("quot", "\"")
42 .Case("apos", "\'")
43 .Default("");
44}
45
46StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
47 unsigned CodePoint = 0;
48 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
49 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
50 CodePoint *= 10;
51 CodePoint += Name[i] - '0';
52 }
53
54 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
55 char *ResolvedPtr = Resolved;
56 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
57 return StringRef(Resolved, ResolvedPtr - Resolved);
58 else
59 return StringRef();
60}
61
62StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
63 unsigned CodePoint = 0;
64 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
65 CodePoint *= 16;
66 const char C = Name[i];
67 assert(isHTMLHexCharacterReferenceCharacter(C));
68 if (C >= '0' && C <= '9')
69 CodePoint += Name[i] - '0';
70 else if (C >= 'a' && C <= 'f')
71 CodePoint += Name[i] - 'a' + 10;
72 else
73 CodePoint += Name[i] - 'A' + 10;
74 }
75
76 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
77 char *ResolvedPtr = Resolved;
78 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
79 return StringRef(Resolved, ResolvedPtr - Resolved);
80 else
81 return StringRef();
82}
83
Dmitri Gribenko2d44d772012-06-26 20:39:18 +000084void Lexer::skipLineStartingDecorations() {
85 // This function should be called only for C comments
86 assert(CommentState == LCS_InsideCComment);
87
88 if (BufferPtr == CommentEnd)
89 return;
90
91 switch (*BufferPtr) {
92 case ' ':
93 case '\t':
94 case '\f':
95 case '\v': {
96 const char *NewBufferPtr = BufferPtr;
97 NewBufferPtr++;
98 if (NewBufferPtr == CommentEnd)
99 return;
100
101 char C = *NewBufferPtr;
102 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
103 NewBufferPtr++;
104 if (NewBufferPtr == CommentEnd)
105 return;
106 C = *NewBufferPtr;
107 }
108 if (C == '*')
109 BufferPtr = NewBufferPtr + 1;
110 break;
111 }
112 case '*':
113 BufferPtr++;
114 break;
115 }
116}
117
118namespace {
/// Returns a pointer to the first newline character ('\n' or '\r') in
/// [BufferPtr, BufferEnd), or BufferEnd if the range contains none.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd) {
    if (*Cur == '\n' || *Cur == '\r')
      break;
    ++Cur;
  }
  return Cur;
}
128
/// Skip one newline sequence at \p BufferPtr: "\n", "\r" or "\r\n".
///
/// \p BufferPtr must either equal \p BufferEnd (in which case it is returned
/// unchanged) or point at a '\n' or '\r' character.
///
/// \returns a pointer just past the newline sequence.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    return BufferPtr + 1;

  assert(*BufferPtr == '\r');
  const char *AfterCR = BufferPtr + 1;
  // Treat "\r\n" as a single newline.
  const bool FollowedByLF = AfterCR != BufferEnd && *AfterCR == '\n';
  return FollowedByLF ? AfterCR + 1 : AfterCR;
}
143
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000144const char *skipNamedCharacterReference(const char *BufferPtr,
145 const char *BufferEnd) {
146 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
147 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
148 return BufferPtr;
149 }
150 return BufferEnd;
151}
152
153const char *skipDecimalCharacterReference(const char *BufferPtr,
154 const char *BufferEnd) {
155 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
156 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
157 return BufferPtr;
158 }
159 return BufferEnd;
160}
161
162const char *skipHexCharacterReference(const char *BufferPtr,
163 const char *BufferEnd) {
164 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
165 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
166 return BufferPtr;
167 }
168 return BufferEnd;
169}
170
/// Returns true if \p C can start an HTML identifier: an ASCII letter of
/// either case.
bool isHTMLIdentifierStartingCharacter(char C) {
  const bool IsLower = C >= 'a' && C <= 'z';
  const bool IsUpper = C >= 'A' && C <= 'Z';
  return IsLower || IsUpper;
}
175
/// Returns true if \p C can appear inside an HTML identifier: an ASCII letter
/// of either case or an ASCII decimal digit.
bool isHTMLIdentifierCharacter(char C) {
  if (C >= '0' && C <= '9')
    return true;
  if (C >= 'A' && C <= 'Z')
    return true;
  return C >= 'a' && C <= 'z';
}
181
182const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
183 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
184 if (!isHTMLIdentifierCharacter(*BufferPtr))
185 return BufferPtr;
186 }
187 return BufferEnd;
188}
189
/// Skip an HTML string quoted in single or double quotes.  A quote character
/// that is immediately preceded by a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote.
///
/// \returns a pointer to the closing quote, or BufferEnd if the string is not
/// terminated within the buffer.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    const bool IsUnescapedQuote = *Cur == Quote && Cur[-1] != '\\';
    if (IsUnescapedQuote)
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
207
/// Returns true if \p C is whitespace that does not end a line: space,
/// horizontal tab, form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
211
/// Returns true if \p C is any whitespace character: space, newline, carriage
/// return, horizontal tab, form feed or vertical tab.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
216
217const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
218 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
219 if (!isWhitespace(*BufferPtr))
220 return BufferPtr;
221 }
222 return BufferEnd;
223}
224
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000225bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
226 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
227}
228
/// Returns true if \p C can be part of a command name: an ASCII letter of
/// either case or an ASCII decimal digit.
bool isCommandNameCharacter(char C) {
  const bool IsLower = C >= 'a' && C <= 'z';
  const bool IsUpper = C >= 'A' && C <= 'Z';
  const bool IsDigit = C >= '0' && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
234
235const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
236 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
237 if (!isCommandNameCharacter(*BufferPtr))
238 return BufferPtr;
239 }
240 return BufferEnd;
241}
242
243/// Return the one past end pointer for BCPL comments.
244/// Handles newlines escaped with backslash or trigraph for backslahs.
245const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
246 const char *CurPtr = BufferPtr;
247 while (CurPtr != BufferEnd) {
248 char C = *CurPtr;
249 while (C != '\n' && C != '\r') {
250 CurPtr++;
251 if (CurPtr == BufferEnd)
252 return BufferEnd;
253 C = *CurPtr;
254 }
255 // We found a newline, check if it is escaped.
256 const char *EscapePtr = CurPtr - 1;
257 while(isHorizontalWhitespace(*EscapePtr))
258 EscapePtr--;
259
260 if (*EscapePtr == '\\' ||
261 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
262 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
263 // We found an escaped newline.
264 CurPtr = skipNewline(CurPtr, BufferEnd);
265 } else
266 return CurPtr; // Not an escaped newline.
267 }
268 return BufferEnd;
269}
270
271/// Return the one past end pointer for C comments.
272/// Very dumb, does not handle escaped newlines or trigraphs.
273const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
274 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
275 if (*BufferPtr == '*') {
276 assert(BufferPtr + 1 != BufferEnd);
277 if (*(BufferPtr + 1) == '/')
278 return BufferPtr;
279 }
280 }
281 llvm_unreachable("buffer end hit before '*/' was seen");
282}
283} // unnamed namespace
284
285void Lexer::lexCommentText(Token &T) {
286 assert(CommentState == LCS_InsideBCPLComment ||
287 CommentState == LCS_InsideCComment);
288
289 switch (State) {
290 case LS_Normal:
291 break;
292 case LS_VerbatimBlockFirstLine:
293 lexVerbatimBlockFirstLine(T);
294 return;
295 case LS_VerbatimBlockBody:
296 lexVerbatimBlockBody(T);
297 return;
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000298 case LS_VerbatimLineText:
299 lexVerbatimLineText(T);
300 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000301 case LS_HTMLStartTag:
302 lexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000303 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000304 case LS_HTMLEndTag:
305 lexHTMLEndTag(T);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000306 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000307 }
308
309 assert(State == LS_Normal);
310
311 const char *TokenPtr = BufferPtr;
312 assert(TokenPtr < CommentEnd);
313 while (TokenPtr != CommentEnd) {
314 switch(*TokenPtr) {
315 case '\\':
316 case '@': {
317 TokenPtr++;
318 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000319 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000320 return;
321 }
322 char C = *TokenPtr;
323 switch (C) {
324 default:
325 break;
326
327 case '\\': case '@': case '&': case '$':
328 case '#': case '<': case '>': case '%':
329 case '\"': case '.': case ':':
330 // This is one of \\ \@ \& \$ etc escape sequences.
331 TokenPtr++;
332 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
333 // This is the \:: escape sequence.
334 TokenPtr++;
335 }
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000336 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000337 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000338 T.setText(UnescapedText);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000339 return;
340 }
341
342 // Don't make zero-length commands.
343 if (!isCommandNameCharacter(*TokenPtr)) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000344 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000345 return;
346 }
347
348 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
349 unsigned Length = TokenPtr - (BufferPtr + 1);
350
351 // Hardcoded support for lexing LaTeX formula commands
352 // \f$ \f[ \f] \f{ \f} as a single command.
353 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
354 C = *TokenPtr;
355 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
356 TokenPtr++;
357 Length++;
358 }
359 }
360
361 const StringRef CommandName(BufferPtr + 1, Length);
362 StringRef EndName;
363
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000364 if (Traits.isVerbatimBlockCommand(CommandName, EndName)) {
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000365 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
366 return;
367 }
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000368 if (Traits.isVerbatimLineCommand(CommandName)) {
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000369 setupAndLexVerbatimLine(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000370 return;
371 }
372 formTokenWithChars(T, TokenPtr, tok::command);
373 T.setCommandName(CommandName);
374 return;
375 }
376
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000377 case '&':
378 lexHTMLCharacterReference(T);
379 return;
380
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000381 case '<': {
382 TokenPtr++;
383 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000384 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000385 return;
386 }
387 const char C = *TokenPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000388 if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000389 setupAndLexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000390 else if (C == '/')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000391 setupAndLexHTMLEndTag(T);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000392 else
393 formTextToken(T, TokenPtr);
394
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000395 return;
396 }
397
398 case '\n':
399 case '\r':
400 TokenPtr = skipNewline(TokenPtr, CommentEnd);
401 formTokenWithChars(T, TokenPtr, tok::newline);
402
403 if (CommentState == LCS_InsideCComment)
404 skipLineStartingDecorations();
405 return;
406
407 default: {
408 while (true) {
409 TokenPtr++;
410 if (TokenPtr == CommentEnd)
411 break;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000412 const char C = *TokenPtr;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000413 if(C == '\n' || C == '\r' ||
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000414 C == '\\' || C == '@' || C == '&' || C == '<')
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000415 break;
416 }
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000417 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000418 return;
419 }
420 }
421 }
422}
423
424void Lexer::setupAndLexVerbatimBlock(Token &T,
425 const char *TextBegin,
426 char Marker, StringRef EndName) {
427 VerbatimBlockEndCommandName.clear();
428 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
429 VerbatimBlockEndCommandName.append(EndName);
430
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000431 StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000432 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000433 T.setVerbatimBlockName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000434
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000435 // If there is a newline following the verbatim opening command, skip the
436 // newline so that we don't create an tok::verbatim_block_line with empty
437 // text content.
438 if (BufferPtr != CommentEnd) {
439 const char C = *BufferPtr;
440 if (C == '\n' || C == '\r') {
441 BufferPtr = skipNewline(BufferPtr, CommentEnd);
442 State = LS_VerbatimBlockBody;
443 return;
444 }
445 }
446
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000447 State = LS_VerbatimBlockFirstLine;
448}
449
450void Lexer::lexVerbatimBlockFirstLine(Token &T) {
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000451again:
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000452 assert(BufferPtr < CommentEnd);
453
454 // FIXME: It would be better to scan the text once, finding either the block
455 // end command or newline.
456 //
457 // Extract current line.
458 const char *Newline = findNewline(BufferPtr, CommentEnd);
459 StringRef Line(BufferPtr, Newline - BufferPtr);
460
461 // Look for end command in current line.
462 size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000463 const char *TextEnd;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000464 const char *NextLine;
465 if (Pos == StringRef::npos) {
466 // Current line is completely verbatim.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000467 TextEnd = Newline;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000468 NextLine = skipNewline(Newline, CommentEnd);
469 } else if (Pos == 0) {
470 // Current line contains just an end command.
471 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000472 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000473 formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000474 T.setVerbatimBlockName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000475 State = LS_Normal;
476 return;
477 } else {
478 // There is some text, followed by end command. Extract text first.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000479 TextEnd = BufferPtr + Pos;
480 NextLine = TextEnd;
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000481 // If there is only whitespace before end command, skip whitespace.
482 if (isWhitespace(BufferPtr, TextEnd)) {
483 BufferPtr = TextEnd;
484 goto again;
485 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000486 }
487
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000488 StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000489 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000490 T.setVerbatimBlockText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000491
492 State = LS_VerbatimBlockBody;
493}
494
495void Lexer::lexVerbatimBlockBody(Token &T) {
496 assert(State == LS_VerbatimBlockBody);
497
498 if (CommentState == LCS_InsideCComment)
499 skipLineStartingDecorations();
500
501 lexVerbatimBlockFirstLine(T);
502}
503
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000504void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) {
505 const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
506 formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
507 T.setVerbatimLineName(Name);
508
509 State = LS_VerbatimLineText;
510}
511
512void Lexer::lexVerbatimLineText(Token &T) {
513 assert(State == LS_VerbatimLineText);
514
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000515 // Extract current line.
516 const char *Newline = findNewline(BufferPtr, CommentEnd);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000517 const StringRef Text(BufferPtr, Newline - BufferPtr);
518 formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000519 T.setVerbatimLineText(Text);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000520
521 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000522}
523
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000524void Lexer::lexHTMLCharacterReference(Token &T) {
525 const char *TokenPtr = BufferPtr;
526 assert(*TokenPtr == '&');
527 TokenPtr++;
528 if (TokenPtr == CommentEnd) {
529 formTextToken(T, TokenPtr);
530 return;
531 }
532 const char *NamePtr;
533 bool isNamed = false;
534 bool isDecimal = false;
535 char C = *TokenPtr;
536 if (isHTMLNamedCharacterReferenceCharacter(C)) {
537 NamePtr = TokenPtr;
538 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
539 isNamed = true;
540 } else if (C == '#') {
541 TokenPtr++;
542 if (TokenPtr == CommentEnd) {
543 formTextToken(T, TokenPtr);
544 return;
545 }
546 C = *TokenPtr;
547 if (isHTMLDecimalCharacterReferenceCharacter(C)) {
548 NamePtr = TokenPtr;
549 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
550 isDecimal = true;
551 } else if (C == 'x' || C == 'X') {
552 TokenPtr++;
553 NamePtr = TokenPtr;
554 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
555 } else {
556 formTextToken(T, TokenPtr);
557 return;
558 }
559 } else {
560 formTextToken(T, TokenPtr);
561 return;
562 }
563 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
564 *TokenPtr != ';') {
565 formTextToken(T, TokenPtr);
566 return;
567 }
568 StringRef Name(NamePtr, TokenPtr - NamePtr);
569 TokenPtr++; // Skip semicolon.
570 StringRef Resolved;
571 if (isNamed)
572 Resolved = resolveHTMLNamedCharacterReference(Name);
573 else if (isDecimal)
574 Resolved = resolveHTMLDecimalCharacterReference(Name);
575 else
576 Resolved = resolveHTMLHexCharacterReference(Name);
577
578 if (Resolved.empty()) {
579 formTextToken(T, TokenPtr);
580 return;
581 }
582 formTokenWithChars(T, TokenPtr, tok::text);
583 T.setText(Resolved);
584 return;
585}
586
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000587void Lexer::setupAndLexHTMLStartTag(Token &T) {
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000588 assert(BufferPtr[0] == '<' &&
589 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000590 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000591 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000592 if (!isHTMLTagName(Name)) {
593 formTextToken(T, TagNameEnd);
594 return;
595 }
596
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000597 formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
598 T.setHTMLTagStartName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000599
600 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
601
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000602 const char C = *BufferPtr;
603 if (BufferPtr != CommentEnd &&
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000604 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000605 State = LS_HTMLStartTag;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000606}
607
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000608void Lexer::lexHTMLStartTag(Token &T) {
609 assert(State == LS_HTMLStartTag);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000610
611 const char *TokenPtr = BufferPtr;
612 char C = *TokenPtr;
613 if (isHTMLIdentifierCharacter(C)) {
614 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000615 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000616 formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000617 T.setHTMLIdent(Ident);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000618 } else {
619 switch (C) {
620 case '=':
621 TokenPtr++;
622 formTokenWithChars(T, TokenPtr, tok::html_equals);
623 break;
624 case '\"':
625 case '\'': {
626 const char *OpenQuote = TokenPtr;
627 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
628 const char *ClosingQuote = TokenPtr;
629 if (TokenPtr != CommentEnd) // Skip closing quote.
630 TokenPtr++;
631 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
632 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
633 ClosingQuote - (OpenQuote + 1)));
634 break;
635 }
636 case '>':
637 TokenPtr++;
638 formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000639 State = LS_Normal;
640 return;
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000641 case '/':
642 TokenPtr++;
643 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
644 TokenPtr++;
645 formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000646 } else
647 formTextToken(T, TokenPtr);
648
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000649 State = LS_Normal;
650 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000651 }
652 }
653
654 // Now look ahead and return to normal state if we don't see any HTML tokens
655 // ahead.
656 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
657 if (BufferPtr == CommentEnd) {
658 State = LS_Normal;
659 return;
660 }
661
662 C = *BufferPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000663 if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000664 C != '=' && C != '\"' && C != '\'' && C != '>') {
665 State = LS_Normal;
666 return;
667 }
668}
669
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000670void Lexer::setupAndLexHTMLEndTag(Token &T) {
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000671 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
672
673 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
674 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000675 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
676 if (!isHTMLTagName(Name)) {
677 formTextToken(T, TagNameEnd);
678 return;
679 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000680
681 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000682
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000683 formTokenWithChars(T, End, tok::html_end_tag);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000684 T.setHTMLTagEndName(Name);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000685
686 if (BufferPtr != CommentEnd && *BufferPtr == '>')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000687 State = LS_HTMLEndTag;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000688}
689
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000690void Lexer::lexHTMLEndTag(Token &T) {
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000691 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
692
693 formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
694 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000695}
696
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000697Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
Dmitri Gribenkoaf503a62012-08-31 10:35:30 +0000698 SourceLocation FileLoc,
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000699 const char *BufferStart, const char *BufferEnd):
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000700 Allocator(Allocator), Traits(Traits),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000701 BufferStart(BufferStart), BufferEnd(BufferEnd),
Dmitri Gribenkoaf503a62012-08-31 10:35:30 +0000702 FileLoc(FileLoc), BufferPtr(BufferStart),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000703 CommentState(LCS_BeforeComment), State(LS_Normal) {
704}
705
706void Lexer::lex(Token &T) {
707again:
708 switch (CommentState) {
709 case LCS_BeforeComment:
710 if (BufferPtr == BufferEnd) {
711 formTokenWithChars(T, BufferPtr, tok::eof);
712 return;
713 }
714
715 assert(*BufferPtr == '/');
716 BufferPtr++; // Skip first slash.
717 switch(*BufferPtr) {
718 case '/': { // BCPL comment.
719 BufferPtr++; // Skip second slash.
720
721 if (BufferPtr != BufferEnd) {
722 // Skip Doxygen magic marker, if it is present.
723 // It might be missing because of a typo //< or /*<, or because we
724 // merged this non-Doxygen comment into a bunch of Doxygen comments
725 // around it: /** ... */ /* ... */ /** ... */
726 const char C = *BufferPtr;
727 if (C == '/' || C == '!')
728 BufferPtr++;
729 }
730
731 // Skip less-than symbol that marks trailing comments.
732 // Skip it even if the comment is not a Doxygen one, because //< and /*<
733 // are frequent typos.
734 if (BufferPtr != BufferEnd && *BufferPtr == '<')
735 BufferPtr++;
736
737 CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000738 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
739 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000740 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
741 goto again;
742 }
743 case '*': { // C comment.
744 BufferPtr++; // Skip star.
745
746 // Skip Doxygen magic marker.
747 const char C = *BufferPtr;
748 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
749 BufferPtr++;
750
751 // Skip less-than symbol that marks trailing comments.
752 if (BufferPtr != BufferEnd && *BufferPtr == '<')
753 BufferPtr++;
754
755 CommentState = LCS_InsideCComment;
756 State = LS_Normal;
757 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
758 goto again;
759 }
760 default:
761 llvm_unreachable("second character of comment should be '/' or '*'");
762 }
763
764 case LCS_BetweenComments: {
765 // Consecutive comments are extracted only if there is only whitespace
766 // between them. So we can search for the start of the next comment.
767 const char *EndWhitespace = BufferPtr;
768 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
769 EndWhitespace++;
770
771 // Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000772 // between them -- guaranteed by comment extraction) into a newline. We
773 // have two newlines between C comments in total (first one was synthesized
774 // after a comment).
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000775 formTokenWithChars(T, EndWhitespace, tok::newline);
776
777 CommentState = LCS_BeforeComment;
778 break;
779 }
780
781 case LCS_InsideBCPLComment:
782 case LCS_InsideCComment:
783 if (BufferPtr != CommentEnd) {
784 lexCommentText(T);
785 break;
786 } else {
787 // Skip C comment closing sequence.
788 if (CommentState == LCS_InsideCComment) {
789 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
790 BufferPtr += 2;
791 assert(BufferPtr <= BufferEnd);
792
793 // Synthenize newline just after the C comment, regardless if there is
794 // actually a newline.
795 formTokenWithChars(T, BufferPtr, tok::newline);
796
797 CommentState = LCS_BetweenComments;
798 break;
799 } else {
800 // Don't synthesized a newline after BCPL comment.
801 CommentState = LCS_BetweenComments;
802 goto again;
803 }
804 }
805 }
806}
807
808StringRef Lexer::getSpelling(const Token &Tok,
809 const SourceManager &SourceMgr,
810 bool *Invalid) const {
811 SourceLocation Loc = Tok.getLocation();
812 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
813
814 bool InvalidTemp = false;
815 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
816 if (InvalidTemp) {
817 *Invalid = true;
818 return StringRef();
819 }
820
821 const char *Begin = File.data() + LocInfo.second;
822 return StringRef(Begin, Tok.getLength());
823}
824
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000825} // end namespace comments
826} // end namespace clang
827