blob: 31a09f71d993c643e4b8b80ad1edf26a74c3a11f [file] [log] [blame]
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00001#include "clang/AST/CommentLexer.h"
Dmitri Gribenkoaa580812012-08-09 00:03:17 +00002#include "clang/AST/CommentCommandTraits.h"
Dmitri Gribenko477a9f52012-07-27 20:37:06 +00003#include "clang/Basic/ConvertUTF.h"
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00004#include "llvm/ADT/StringSwitch.h"
5#include "llvm/Support/ErrorHandling.h"
6
7namespace clang {
8namespace comments {
9
10void Token::dump(const Lexer &L, const SourceManager &SM) const {
11 llvm::errs() << "comments::Token Kind=" << Kind << " ";
12 Loc.dump(SM);
13 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
14}
15
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000016namespace {
/// Returns true if \p C may appear in the name of a named HTML character
/// reference, i.e. it is an ASCII letter.
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  return IsLower || IsUpper;
}
21
/// Returns true if \p C is an ASCII decimal digit.
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  if (C < '0')
    return false;
  return C <= '9';
}
25
/// Returns true if \p C is an ASCII hexadecimal digit (0-9, a-f or A-F).
bool isHTMLHexCharacterReferenceCharacter(char C) {
  const bool IsDigit = '0' <= C && C <= '9';
  const bool IsLowerHex = 'a' <= C && C <= 'f';
  const bool IsUpperHex = 'A' <= C && C <= 'F';
  return IsDigit || IsLowerHex || IsUpperHex;
}
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +000031
Dmitri Gribenkoc24a76e2012-08-31 02:21:44 +000032#include "clang/AST/CommentHTMLTags.inc"
33
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000034} // unnamed namespace
35
36StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
37 return llvm::StringSwitch<StringRef>(Name)
38 .Case("amp", "&")
39 .Case("lt", "<")
40 .Case("gt", ">")
41 .Case("quot", "\"")
42 .Case("apos", "\'")
43 .Default("");
44}
45
46StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
47 unsigned CodePoint = 0;
48 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
49 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
50 CodePoint *= 10;
51 CodePoint += Name[i] - '0';
52 }
53
54 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
55 char *ResolvedPtr = Resolved;
56 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
57 return StringRef(Resolved, ResolvedPtr - Resolved);
58 else
59 return StringRef();
60}
61
62StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
63 unsigned CodePoint = 0;
64 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
65 CodePoint *= 16;
66 const char C = Name[i];
67 assert(isHTMLHexCharacterReferenceCharacter(C));
68 if (C >= '0' && C <= '9')
69 CodePoint += Name[i] - '0';
70 else if (C >= 'a' && C <= 'f')
71 CodePoint += Name[i] - 'a' + 10;
72 else
73 CodePoint += Name[i] - 'A' + 10;
74 }
75
76 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
77 char *ResolvedPtr = Resolved;
78 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
79 return StringRef(Resolved, ResolvedPtr - Resolved);
80 else
81 return StringRef();
82}
83
Dmitri Gribenko2d44d772012-06-26 20:39:18 +000084void Lexer::skipLineStartingDecorations() {
85 // This function should be called only for C comments
86 assert(CommentState == LCS_InsideCComment);
87
88 if (BufferPtr == CommentEnd)
89 return;
90
91 switch (*BufferPtr) {
92 case ' ':
93 case '\t':
94 case '\f':
95 case '\v': {
96 const char *NewBufferPtr = BufferPtr;
97 NewBufferPtr++;
98 if (NewBufferPtr == CommentEnd)
99 return;
100
101 char C = *NewBufferPtr;
102 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
103 NewBufferPtr++;
104 if (NewBufferPtr == CommentEnd)
105 return;
106 C = *NewBufferPtr;
107 }
108 if (C == '*')
109 BufferPtr = NewBufferPtr + 1;
110 break;
111 }
112 case '*':
113 BufferPtr++;
114 break;
115 }
116}
117
118namespace {
/// Returns a pointer to the first '\n' or '\r' in [BufferPtr, BufferEnd), or
/// \p BufferEnd if the range contains neither.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd && *Cur != '\n' && *Cur != '\r')
    ++Cur;
  return Cur;
}
128
/// Skips one newline sequence ("\n", "\r" or "\r\n") at \p BufferPtr and
/// returns the pointer just past it.  Returns \p BufferPtr unchanged when it
/// already equals \p BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single newline.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
143
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000144const char *skipNamedCharacterReference(const char *BufferPtr,
145 const char *BufferEnd) {
146 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
147 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
148 return BufferPtr;
149 }
150 return BufferEnd;
151}
152
153const char *skipDecimalCharacterReference(const char *BufferPtr,
154 const char *BufferEnd) {
155 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
156 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
157 return BufferPtr;
158 }
159 return BufferEnd;
160}
161
162const char *skipHexCharacterReference(const char *BufferPtr,
163 const char *BufferEnd) {
164 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
165 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
166 return BufferPtr;
167 }
168 return BufferEnd;
169}
170
/// Returns true if \p C may start an HTML identifier, i.e. it is an ASCII
/// letter.
bool isHTMLIdentifierStartingCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  return 'A' <= C && C <= 'Z';
}
175
/// Returns true if \p C may appear inside an HTML identifier, i.e. it is an
/// ASCII letter or decimal digit.
bool isHTMLIdentifierCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  const bool IsDigit = '0' <= C && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
181
182const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
183 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
184 if (!isHTMLIdentifierCharacter(*BufferPtr))
185 return BufferPtr;
186 }
187 return BufferEnd;
188}
189
/// Skips an HTML string quoted in single or double quotes.  \p BufferPtr must
/// point at the opening quote.  A quote character that is immediately
/// preceded by a backslash does not terminate the string.
///
/// Returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    if (*Cur == Quote && *(Cur - 1) != '\\')
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
207
/// Returns true if \p C is a space, tab, form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
211
/// Returns true if \p C is a space, newline, carriage return, tab, form feed
/// or vertical tab.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
216
217const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
218 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
219 if (!isWhitespace(*BufferPtr))
220 return BufferPtr;
221 }
222 return BufferEnd;
223}
224
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000225bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
226 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
227}
228
/// Returns true if \p C may start a command name, i.e. it is an ASCII letter.
bool isCommandNameStartCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  return 'A' <= C && C <= 'Z';
}
233
/// Returns true if \p C may appear inside a command name, i.e. it is an ASCII
/// letter or decimal digit.
bool isCommandNameCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  const bool IsDigit = '0' <= C && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
239
240const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
241 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
242 if (!isCommandNameCharacter(*BufferPtr))
243 return BufferPtr;
244 }
245 return BufferEnd;
246}
247
248/// Return the one past end pointer for BCPL comments.
249/// Handles newlines escaped with backslash or trigraph for backslahs.
250const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
251 const char *CurPtr = BufferPtr;
252 while (CurPtr != BufferEnd) {
253 char C = *CurPtr;
254 while (C != '\n' && C != '\r') {
255 CurPtr++;
256 if (CurPtr == BufferEnd)
257 return BufferEnd;
258 C = *CurPtr;
259 }
260 // We found a newline, check if it is escaped.
261 const char *EscapePtr = CurPtr - 1;
262 while(isHorizontalWhitespace(*EscapePtr))
263 EscapePtr--;
264
265 if (*EscapePtr == '\\' ||
266 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
267 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
268 // We found an escaped newline.
269 CurPtr = skipNewline(CurPtr, BufferEnd);
270 } else
271 return CurPtr; // Not an escaped newline.
272 }
273 return BufferEnd;
274}
275
276/// Return the one past end pointer for C comments.
277/// Very dumb, does not handle escaped newlines or trigraphs.
278const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
279 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
280 if (*BufferPtr == '*') {
281 assert(BufferPtr + 1 != BufferEnd);
282 if (*(BufferPtr + 1) == '/')
283 return BufferPtr;
284 }
285 }
286 llvm_unreachable("buffer end hit before '*/' was seen");
287}
288} // unnamed namespace
289
290void Lexer::lexCommentText(Token &T) {
291 assert(CommentState == LCS_InsideBCPLComment ||
292 CommentState == LCS_InsideCComment);
293
294 switch (State) {
295 case LS_Normal:
296 break;
297 case LS_VerbatimBlockFirstLine:
298 lexVerbatimBlockFirstLine(T);
299 return;
300 case LS_VerbatimBlockBody:
301 lexVerbatimBlockBody(T);
302 return;
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000303 case LS_VerbatimLineText:
304 lexVerbatimLineText(T);
305 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000306 case LS_HTMLStartTag:
307 lexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000308 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000309 case LS_HTMLEndTag:
310 lexHTMLEndTag(T);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000311 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000312 }
313
314 assert(State == LS_Normal);
315
316 const char *TokenPtr = BufferPtr;
317 assert(TokenPtr < CommentEnd);
318 while (TokenPtr != CommentEnd) {
319 switch(*TokenPtr) {
320 case '\\':
321 case '@': {
322 TokenPtr++;
323 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000324 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000325 return;
326 }
327 char C = *TokenPtr;
328 switch (C) {
329 default:
330 break;
331
332 case '\\': case '@': case '&': case '$':
333 case '#': case '<': case '>': case '%':
334 case '\"': case '.': case ':':
335 // This is one of \\ \@ \& \$ etc escape sequences.
336 TokenPtr++;
337 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
338 // This is the \:: escape sequence.
339 TokenPtr++;
340 }
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000341 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000342 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000343 T.setText(UnescapedText);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000344 return;
345 }
346
347 // Don't make zero-length commands.
Dmitri Gribenko8c05da32012-09-14 16:35:35 +0000348 if (!isCommandNameStartCharacter(*TokenPtr)) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000349 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000350 return;
351 }
352
353 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
354 unsigned Length = TokenPtr - (BufferPtr + 1);
355
356 // Hardcoded support for lexing LaTeX formula commands
357 // \f$ \f[ \f] \f{ \f} as a single command.
358 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
359 C = *TokenPtr;
360 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
361 TokenPtr++;
362 Length++;
363 }
364 }
365
366 const StringRef CommandName(BufferPtr + 1, Length);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000367
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000368 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
369 if (!Info) {
370 formTokenWithChars(T, TokenPtr, tok::unknown_command);
371 T.setUnknownCommandName(CommandName);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000372 return;
373 }
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000374 if (Info->IsVerbatimBlockCommand) {
375 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
376 return;
377 }
378 if (Info->IsVerbatimLineCommand) {
379 setupAndLexVerbatimLine(T, TokenPtr, Info);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000380 return;
381 }
382 formTokenWithChars(T, TokenPtr, tok::command);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000383 T.setCommandID(Info->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000384 return;
385 }
386
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000387 case '&':
388 lexHTMLCharacterReference(T);
389 return;
390
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000391 case '<': {
392 TokenPtr++;
393 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000394 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000395 return;
396 }
397 const char C = *TokenPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000398 if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000399 setupAndLexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000400 else if (C == '/')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000401 setupAndLexHTMLEndTag(T);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000402 else
403 formTextToken(T, TokenPtr);
404
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000405 return;
406 }
407
408 case '\n':
409 case '\r':
410 TokenPtr = skipNewline(TokenPtr, CommentEnd);
411 formTokenWithChars(T, TokenPtr, tok::newline);
412
413 if (CommentState == LCS_InsideCComment)
414 skipLineStartingDecorations();
415 return;
416
417 default: {
418 while (true) {
419 TokenPtr++;
420 if (TokenPtr == CommentEnd)
421 break;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000422 const char C = *TokenPtr;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000423 if(C == '\n' || C == '\r' ||
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000424 C == '\\' || C == '@' || C == '&' || C == '<')
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000425 break;
426 }
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000427 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000428 return;
429 }
430 }
431 }
432}
433
434void Lexer::setupAndLexVerbatimBlock(Token &T,
435 const char *TextBegin,
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000436 char Marker, const CommandInfo *Info) {
437 assert(Info->IsVerbatimBlockCommand);
438
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000439 VerbatimBlockEndCommandName.clear();
440 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000441 VerbatimBlockEndCommandName.append(Info->EndCommandName);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000442
443 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000444 T.setVerbatimBlockID(Info->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000445
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000446 // If there is a newline following the verbatim opening command, skip the
447 // newline so that we don't create an tok::verbatim_block_line with empty
448 // text content.
449 if (BufferPtr != CommentEnd) {
450 const char C = *BufferPtr;
451 if (C == '\n' || C == '\r') {
452 BufferPtr = skipNewline(BufferPtr, CommentEnd);
453 State = LS_VerbatimBlockBody;
454 return;
455 }
456 }
457
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000458 State = LS_VerbatimBlockFirstLine;
459}
460
461void Lexer::lexVerbatimBlockFirstLine(Token &T) {
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000462again:
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000463 assert(BufferPtr < CommentEnd);
464
465 // FIXME: It would be better to scan the text once, finding either the block
466 // end command or newline.
467 //
468 // Extract current line.
469 const char *Newline = findNewline(BufferPtr, CommentEnd);
470 StringRef Line(BufferPtr, Newline - BufferPtr);
471
472 // Look for end command in current line.
473 size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000474 const char *TextEnd;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000475 const char *NextLine;
476 if (Pos == StringRef::npos) {
477 // Current line is completely verbatim.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000478 TextEnd = Newline;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000479 NextLine = skipNewline(Newline, CommentEnd);
480 } else if (Pos == 0) {
481 // Current line contains just an end command.
482 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000483 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000484 formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000485 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000486 State = LS_Normal;
487 return;
488 } else {
489 // There is some text, followed by end command. Extract text first.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000490 TextEnd = BufferPtr + Pos;
491 NextLine = TextEnd;
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000492 // If there is only whitespace before end command, skip whitespace.
493 if (isWhitespace(BufferPtr, TextEnd)) {
494 BufferPtr = TextEnd;
495 goto again;
496 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000497 }
498
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000499 StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000500 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000501 T.setVerbatimBlockText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000502
503 State = LS_VerbatimBlockBody;
504}
505
506void Lexer::lexVerbatimBlockBody(Token &T) {
507 assert(State == LS_VerbatimBlockBody);
508
509 if (CommentState == LCS_InsideCComment)
510 skipLineStartingDecorations();
511
512 lexVerbatimBlockFirstLine(T);
513}
514
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000515void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
516 const CommandInfo *Info) {
517 assert(Info->IsVerbatimLineCommand);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000518 formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000519 T.setVerbatimLineID(Info->getID());
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000520
521 State = LS_VerbatimLineText;
522}
523
524void Lexer::lexVerbatimLineText(Token &T) {
525 assert(State == LS_VerbatimLineText);
526
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000527 // Extract current line.
528 const char *Newline = findNewline(BufferPtr, CommentEnd);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000529 const StringRef Text(BufferPtr, Newline - BufferPtr);
530 formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000531 T.setVerbatimLineText(Text);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000532
533 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000534}
535
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000536void Lexer::lexHTMLCharacterReference(Token &T) {
537 const char *TokenPtr = BufferPtr;
538 assert(*TokenPtr == '&');
539 TokenPtr++;
540 if (TokenPtr == CommentEnd) {
541 formTextToken(T, TokenPtr);
542 return;
543 }
544 const char *NamePtr;
545 bool isNamed = false;
546 bool isDecimal = false;
547 char C = *TokenPtr;
548 if (isHTMLNamedCharacterReferenceCharacter(C)) {
549 NamePtr = TokenPtr;
550 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
551 isNamed = true;
552 } else if (C == '#') {
553 TokenPtr++;
554 if (TokenPtr == CommentEnd) {
555 formTextToken(T, TokenPtr);
556 return;
557 }
558 C = *TokenPtr;
559 if (isHTMLDecimalCharacterReferenceCharacter(C)) {
560 NamePtr = TokenPtr;
561 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
562 isDecimal = true;
563 } else if (C == 'x' || C == 'X') {
564 TokenPtr++;
565 NamePtr = TokenPtr;
566 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
567 } else {
568 formTextToken(T, TokenPtr);
569 return;
570 }
571 } else {
572 formTextToken(T, TokenPtr);
573 return;
574 }
575 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
576 *TokenPtr != ';') {
577 formTextToken(T, TokenPtr);
578 return;
579 }
580 StringRef Name(NamePtr, TokenPtr - NamePtr);
581 TokenPtr++; // Skip semicolon.
582 StringRef Resolved;
583 if (isNamed)
584 Resolved = resolveHTMLNamedCharacterReference(Name);
585 else if (isDecimal)
586 Resolved = resolveHTMLDecimalCharacterReference(Name);
587 else
588 Resolved = resolveHTMLHexCharacterReference(Name);
589
590 if (Resolved.empty()) {
591 formTextToken(T, TokenPtr);
592 return;
593 }
594 formTokenWithChars(T, TokenPtr, tok::text);
595 T.setText(Resolved);
596 return;
597}
598
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000599void Lexer::setupAndLexHTMLStartTag(Token &T) {
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000600 assert(BufferPtr[0] == '<' &&
601 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000602 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000603 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000604 if (!isHTMLTagName(Name)) {
605 formTextToken(T, TagNameEnd);
606 return;
607 }
608
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000609 formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
610 T.setHTMLTagStartName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000611
612 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
613
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000614 const char C = *BufferPtr;
615 if (BufferPtr != CommentEnd &&
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000616 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000617 State = LS_HTMLStartTag;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000618}
619
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000620void Lexer::lexHTMLStartTag(Token &T) {
621 assert(State == LS_HTMLStartTag);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000622
623 const char *TokenPtr = BufferPtr;
624 char C = *TokenPtr;
625 if (isHTMLIdentifierCharacter(C)) {
626 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000627 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000628 formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000629 T.setHTMLIdent(Ident);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000630 } else {
631 switch (C) {
632 case '=':
633 TokenPtr++;
634 formTokenWithChars(T, TokenPtr, tok::html_equals);
635 break;
636 case '\"':
637 case '\'': {
638 const char *OpenQuote = TokenPtr;
639 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
640 const char *ClosingQuote = TokenPtr;
641 if (TokenPtr != CommentEnd) // Skip closing quote.
642 TokenPtr++;
643 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
644 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
645 ClosingQuote - (OpenQuote + 1)));
646 break;
647 }
648 case '>':
649 TokenPtr++;
650 formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000651 State = LS_Normal;
652 return;
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000653 case '/':
654 TokenPtr++;
655 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
656 TokenPtr++;
657 formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000658 } else
659 formTextToken(T, TokenPtr);
660
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000661 State = LS_Normal;
662 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000663 }
664 }
665
666 // Now look ahead and return to normal state if we don't see any HTML tokens
667 // ahead.
668 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
669 if (BufferPtr == CommentEnd) {
670 State = LS_Normal;
671 return;
672 }
673
674 C = *BufferPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000675 if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000676 C != '=' && C != '\"' && C != '\'' && C != '>') {
677 State = LS_Normal;
678 return;
679 }
680}
681
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000682void Lexer::setupAndLexHTMLEndTag(Token &T) {
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000683 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
684
685 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
686 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000687 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
688 if (!isHTMLTagName(Name)) {
689 formTextToken(T, TagNameEnd);
690 return;
691 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000692
693 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000694
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000695 formTokenWithChars(T, End, tok::html_end_tag);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000696 T.setHTMLTagEndName(Name);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000697
698 if (BufferPtr != CommentEnd && *BufferPtr == '>')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000699 State = LS_HTMLEndTag;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000700}
701
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000702void Lexer::lexHTMLEndTag(Token &T) {
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000703 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
704
705 formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
706 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000707}
708
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000709Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
Dmitri Gribenkoaf503a62012-08-31 10:35:30 +0000710 SourceLocation FileLoc,
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000711 const char *BufferStart, const char *BufferEnd):
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000712 Allocator(Allocator), Traits(Traits),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000713 BufferStart(BufferStart), BufferEnd(BufferEnd),
Dmitri Gribenkoaf503a62012-08-31 10:35:30 +0000714 FileLoc(FileLoc), BufferPtr(BufferStart),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000715 CommentState(LCS_BeforeComment), State(LS_Normal) {
716}
717
718void Lexer::lex(Token &T) {
719again:
720 switch (CommentState) {
721 case LCS_BeforeComment:
722 if (BufferPtr == BufferEnd) {
723 formTokenWithChars(T, BufferPtr, tok::eof);
724 return;
725 }
726
727 assert(*BufferPtr == '/');
728 BufferPtr++; // Skip first slash.
729 switch(*BufferPtr) {
730 case '/': { // BCPL comment.
731 BufferPtr++; // Skip second slash.
732
733 if (BufferPtr != BufferEnd) {
734 // Skip Doxygen magic marker, if it is present.
735 // It might be missing because of a typo //< or /*<, or because we
736 // merged this non-Doxygen comment into a bunch of Doxygen comments
737 // around it: /** ... */ /* ... */ /** ... */
738 const char C = *BufferPtr;
739 if (C == '/' || C == '!')
740 BufferPtr++;
741 }
742
743 // Skip less-than symbol that marks trailing comments.
744 // Skip it even if the comment is not a Doxygen one, because //< and /*<
745 // are frequent typos.
746 if (BufferPtr != BufferEnd && *BufferPtr == '<')
747 BufferPtr++;
748
749 CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000750 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
751 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000752 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
753 goto again;
754 }
755 case '*': { // C comment.
756 BufferPtr++; // Skip star.
757
758 // Skip Doxygen magic marker.
759 const char C = *BufferPtr;
760 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
761 BufferPtr++;
762
763 // Skip less-than symbol that marks trailing comments.
764 if (BufferPtr != BufferEnd && *BufferPtr == '<')
765 BufferPtr++;
766
767 CommentState = LCS_InsideCComment;
768 State = LS_Normal;
769 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
770 goto again;
771 }
772 default:
773 llvm_unreachable("second character of comment should be '/' or '*'");
774 }
775
776 case LCS_BetweenComments: {
777 // Consecutive comments are extracted only if there is only whitespace
778 // between them. So we can search for the start of the next comment.
779 const char *EndWhitespace = BufferPtr;
780 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
781 EndWhitespace++;
782
783 // Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000784 // between them -- guaranteed by comment extraction) into a newline. We
785 // have two newlines between C comments in total (first one was synthesized
786 // after a comment).
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000787 formTokenWithChars(T, EndWhitespace, tok::newline);
788
789 CommentState = LCS_BeforeComment;
790 break;
791 }
792
793 case LCS_InsideBCPLComment:
794 case LCS_InsideCComment:
795 if (BufferPtr != CommentEnd) {
796 lexCommentText(T);
797 break;
798 } else {
799 // Skip C comment closing sequence.
800 if (CommentState == LCS_InsideCComment) {
801 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
802 BufferPtr += 2;
803 assert(BufferPtr <= BufferEnd);
804
805 // Synthenize newline just after the C comment, regardless if there is
806 // actually a newline.
807 formTokenWithChars(T, BufferPtr, tok::newline);
808
809 CommentState = LCS_BetweenComments;
810 break;
811 } else {
812 // Don't synthesized a newline after BCPL comment.
813 CommentState = LCS_BetweenComments;
814 goto again;
815 }
816 }
817 }
818}
819
820StringRef Lexer::getSpelling(const Token &Tok,
821 const SourceManager &SourceMgr,
822 bool *Invalid) const {
823 SourceLocation Loc = Tok.getLocation();
824 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
825
826 bool InvalidTemp = false;
827 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
828 if (InvalidTemp) {
829 *Invalid = true;
830 return StringRef();
831 }
832
833 const char *Begin = File.data() + LocInfo.second;
834 return StringRef(Begin, Tok.getLength());
835}
836
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000837} // end namespace comments
838} // end namespace clang
839