// blob: c5de09d0b27582b01970082b5d13eba769de6b14
#include "clang/AST/CommentLexer.h"
#include "clang/AST/CommentCommandTraits.h"
#include "clang/Basic/ConvertUTF.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"

namespace clang {
namespace comments {

10void Token::dump(const Lexer &L, const SourceManager &SM) const {
11 llvm::errs() << "comments::Token Kind=" << Kind << " ";
12 Loc.dump(SM);
13 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
14}
15
namespace {
/// Returns true if \p C may appear in the name of a named HTML character
/// reference (the "amp" in "&amp;"), i.e. if it is an ASCII letter.
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}

/// Returns true if \p C is an ASCII decimal digit, i.e. a character that may
/// appear in the numeric part of a decimal HTML character reference
/// (the "65" in "&#65;").
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return C >= '0' && C <= '9';
}

/// Returns true if \p C is an ASCII hexadecimal digit (0-9, a-f or A-F), i.e.
/// a character that may appear in the numeric part of a hexadecimal HTML
/// character reference (the "41" in "&#x41;").
bool isHTMLHexCharacterReferenceCharacter(char C) {
  return (C >= '0' && C <= '9') ||
         (C >= 'a' && C <= 'f') ||
         (C >= 'A' && C <= 'F');
}

#include "clang/AST/CommentHTMLTags.inc"

} // unnamed namespace

36StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
37 return llvm::StringSwitch<StringRef>(Name)
38 .Case("amp", "&")
39 .Case("lt", "<")
40 .Case("gt", ">")
41 .Case("quot", "\"")
42 .Case("apos", "\'")
43 .Default("");
44}
45
46StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
47 unsigned CodePoint = 0;
48 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
49 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
50 CodePoint *= 10;
51 CodePoint += Name[i] - '0';
52 }
53
54 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
55 char *ResolvedPtr = Resolved;
56 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
57 return StringRef(Resolved, ResolvedPtr - Resolved);
58 else
59 return StringRef();
60}
61
62StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
63 unsigned CodePoint = 0;
64 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
65 CodePoint *= 16;
66 const char C = Name[i];
67 assert(isHTMLHexCharacterReferenceCharacter(C));
68 if (C >= '0' && C <= '9')
69 CodePoint += Name[i] - '0';
70 else if (C >= 'a' && C <= 'f')
71 CodePoint += Name[i] - 'a' + 10;
72 else
73 CodePoint += Name[i] - 'A' + 10;
74 }
75
76 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
77 char *ResolvedPtr = Resolved;
78 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
79 return StringRef(Resolved, ResolvedPtr - Resolved);
80 else
81 return StringRef();
82}
83
Dmitri Gribenko2d44d772012-06-26 20:39:18 +000084void Lexer::skipLineStartingDecorations() {
85 // This function should be called only for C comments
86 assert(CommentState == LCS_InsideCComment);
87
88 if (BufferPtr == CommentEnd)
89 return;
90
91 switch (*BufferPtr) {
92 case ' ':
93 case '\t':
94 case '\f':
95 case '\v': {
96 const char *NewBufferPtr = BufferPtr;
97 NewBufferPtr++;
98 if (NewBufferPtr == CommentEnd)
99 return;
100
101 char C = *NewBufferPtr;
102 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
103 NewBufferPtr++;
104 if (NewBufferPtr == CommentEnd)
105 return;
106 C = *NewBufferPtr;
107 }
108 if (C == '*')
109 BufferPtr = NewBufferPtr + 1;
110 break;
111 }
112 case '*':
113 BufferPtr++;
114 break;
115 }
116}
117
namespace {
/// Returns pointer to the first newline character ('\n' or '\r') in the range
/// [BufferPtr, BufferEnd), or \p BufferEnd if the range contains none.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    if (C == '\n' || C == '\r')
      return BufferPtr;
  }
  return BufferEnd;
}

/// Skips one line terminator ("\n", "\r" or "\r\n") at \p BufferPtr.
///
/// \p BufferPtr must either be equal to \p BufferEnd, in which case it is
/// returned unchanged, or point at '\n' or '\r'.
///
/// \returns pointer to the character after the line terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    BufferPtr++;
  else {
    assert(*BufferPtr == '\r');
    BufferPtr++;
    // Treat "\r\n" as a single line terminator.
    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
      BufferPtr++;
  }
  return BufferPtr;
}

Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000144const char *skipNamedCharacterReference(const char *BufferPtr,
145 const char *BufferEnd) {
146 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
147 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
148 return BufferPtr;
149 }
150 return BufferEnd;
151}
152
153const char *skipDecimalCharacterReference(const char *BufferPtr,
154 const char *BufferEnd) {
155 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
156 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
157 return BufferPtr;
158 }
159 return BufferEnd;
160}
161
162const char *skipHexCharacterReference(const char *BufferPtr,
163 const char *BufferEnd) {
164 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
165 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
166 return BufferPtr;
167 }
168 return BufferEnd;
169}
170
/// Returns true if \p C may begin an HTML identifier (a tag or attribute
/// name), i.e. if it is an ASCII letter.
bool isHTMLIdentifierStartingCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}

/// Returns true if \p C may appear inside an HTML identifier (a tag or
/// attribute name), i.e. if it is an ASCII letter or decimal digit.
bool isHTMLIdentifierCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z') ||
         (C >= '0' && C <= '9');
}

182const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
183 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
184 if (!isHTMLIdentifierCharacter(*BufferPtr))
185 return BufferPtr;
186 }
187 return BufferEnd;
188}
189
/// Skip HTML string quoted in single or double quotes.  A quote character
/// that is immediately preceded by a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote.
///
/// Returns pointer to closing quote, or \p BufferEnd if the string is not
/// terminated before the end of the buffer.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  BufferPtr++;
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    // BufferPtr[-1] is always valid here: it is the opening quote at worst.
    if (C == Quote && BufferPtr[-1] != '\\')
      return BufferPtr;
  }
  return BufferEnd;
}

/// Returns true if \p C is whitespace that does not end a line: space,
/// horizontal tab, form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  return C == ' ' || C == '\t' || C == '\f' || C == '\v';
}

/// Returns true if \p C is a whitespace character: space, newline, carriage
/// return, horizontal tab, form feed or vertical tab.
bool isWhitespace(char C) {
  return C == ' ' || C == '\n' || C == '\r' ||
         C == '\t' || C == '\f' || C == '\v';
}

217const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
218 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
219 if (!isWhitespace(*BufferPtr))
220 return BufferPtr;
221 }
222 return BufferEnd;
223}
224
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000225bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
226 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
227}
228
/// Returns true if \p C may begin a command name (the text after '\' or
/// '@'), i.e. if it is an ASCII letter.
bool isCommandNameStartCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}

/// Returns true if \p C may appear inside a command name, i.e. if it is an
/// ASCII letter or decimal digit.
bool isCommandNameCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z') ||
         (C >= '0' && C <= '9');
}

240const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
241 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
242 if (!isCommandNameCharacter(*BufferPtr))
243 return BufferPtr;
244 }
245 return BufferEnd;
246}
247
248/// Return the one past end pointer for BCPL comments.
249/// Handles newlines escaped with backslash or trigraph for backslahs.
250const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
251 const char *CurPtr = BufferPtr;
252 while (CurPtr != BufferEnd) {
253 char C = *CurPtr;
254 while (C != '\n' && C != '\r') {
255 CurPtr++;
256 if (CurPtr == BufferEnd)
257 return BufferEnd;
258 C = *CurPtr;
259 }
260 // We found a newline, check if it is escaped.
261 const char *EscapePtr = CurPtr - 1;
262 while(isHorizontalWhitespace(*EscapePtr))
263 EscapePtr--;
264
265 if (*EscapePtr == '\\' ||
266 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
267 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
268 // We found an escaped newline.
269 CurPtr = skipNewline(CurPtr, BufferEnd);
270 } else
271 return CurPtr; // Not an escaped newline.
272 }
273 return BufferEnd;
274}
275
276/// Return the one past end pointer for C comments.
277/// Very dumb, does not handle escaped newlines or trigraphs.
278const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
279 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
280 if (*BufferPtr == '*') {
281 assert(BufferPtr + 1 != BufferEnd);
282 if (*(BufferPtr + 1) == '/')
283 return BufferPtr;
284 }
285 }
286 llvm_unreachable("buffer end hit before '*/' was seen");
287}
} // unnamed namespace

290void Lexer::lexCommentText(Token &T) {
291 assert(CommentState == LCS_InsideBCPLComment ||
292 CommentState == LCS_InsideCComment);
293
294 switch (State) {
295 case LS_Normal:
296 break;
297 case LS_VerbatimBlockFirstLine:
298 lexVerbatimBlockFirstLine(T);
299 return;
300 case LS_VerbatimBlockBody:
301 lexVerbatimBlockBody(T);
302 return;
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000303 case LS_VerbatimLineText:
304 lexVerbatimLineText(T);
305 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000306 case LS_HTMLStartTag:
307 lexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000308 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000309 case LS_HTMLEndTag:
310 lexHTMLEndTag(T);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000311 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000312 }
313
314 assert(State == LS_Normal);
315
316 const char *TokenPtr = BufferPtr;
317 assert(TokenPtr < CommentEnd);
318 while (TokenPtr != CommentEnd) {
319 switch(*TokenPtr) {
320 case '\\':
321 case '@': {
322 TokenPtr++;
323 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000324 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000325 return;
326 }
327 char C = *TokenPtr;
328 switch (C) {
329 default:
330 break;
331
332 case '\\': case '@': case '&': case '$':
333 case '#': case '<': case '>': case '%':
334 case '\"': case '.': case ':':
335 // This is one of \\ \@ \& \$ etc escape sequences.
336 TokenPtr++;
337 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
338 // This is the \:: escape sequence.
339 TokenPtr++;
340 }
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000341 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000342 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000343 T.setText(UnescapedText);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000344 return;
345 }
346
347 // Don't make zero-length commands.
Dmitri Gribenko8c05da32012-09-14 16:35:35 +0000348 if (!isCommandNameStartCharacter(*TokenPtr)) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000349 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000350 return;
351 }
352
353 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
354 unsigned Length = TokenPtr - (BufferPtr + 1);
355
356 // Hardcoded support for lexing LaTeX formula commands
357 // \f$ \f[ \f] \f{ \f} as a single command.
358 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
359 C = *TokenPtr;
360 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
361 TokenPtr++;
362 Length++;
363 }
364 }
365
366 const StringRef CommandName(BufferPtr + 1, Length);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000367
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000368 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
369 if (!Info) {
370 formTokenWithChars(T, TokenPtr, tok::unknown_command);
371 T.setUnknownCommandName(CommandName);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000372 return;
373 }
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000374 if (Info->IsVerbatimBlockCommand) {
375 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
376 return;
377 }
378 if (Info->IsVerbatimLineCommand) {
379 setupAndLexVerbatimLine(T, TokenPtr, Info);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000380 return;
381 }
382 formTokenWithChars(T, TokenPtr, tok::command);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000383 T.setCommandID(Info->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000384 return;
385 }
386
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000387 case '&':
388 lexHTMLCharacterReference(T);
389 return;
390
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000391 case '<': {
392 TokenPtr++;
393 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000394 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000395 return;
396 }
397 const char C = *TokenPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000398 if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000399 setupAndLexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000400 else if (C == '/')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000401 setupAndLexHTMLEndTag(T);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000402 else
403 formTextToken(T, TokenPtr);
404
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000405 return;
406 }
407
408 case '\n':
409 case '\r':
410 TokenPtr = skipNewline(TokenPtr, CommentEnd);
411 formTokenWithChars(T, TokenPtr, tok::newline);
412
413 if (CommentState == LCS_InsideCComment)
414 skipLineStartingDecorations();
415 return;
416
417 default: {
Dmitri Gribenkoaa7dbaf2012-12-30 19:45:46 +0000418 size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
419 find_first_of("\n\r\\@&<");
420 if (End != StringRef::npos)
421 TokenPtr += End;
422 else
423 TokenPtr = CommentEnd;
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000424 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000425 return;
426 }
427 }
428 }
429}
430
431void Lexer::setupAndLexVerbatimBlock(Token &T,
432 const char *TextBegin,
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000433 char Marker, const CommandInfo *Info) {
434 assert(Info->IsVerbatimBlockCommand);
435
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000436 VerbatimBlockEndCommandName.clear();
437 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000438 VerbatimBlockEndCommandName.append(Info->EndCommandName);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000439
440 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000441 T.setVerbatimBlockID(Info->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000442
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000443 // If there is a newline following the verbatim opening command, skip the
444 // newline so that we don't create an tok::verbatim_block_line with empty
445 // text content.
446 if (BufferPtr != CommentEnd) {
447 const char C = *BufferPtr;
448 if (C == '\n' || C == '\r') {
449 BufferPtr = skipNewline(BufferPtr, CommentEnd);
450 State = LS_VerbatimBlockBody;
451 return;
452 }
453 }
454
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000455 State = LS_VerbatimBlockFirstLine;
456}
457
458void Lexer::lexVerbatimBlockFirstLine(Token &T) {
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000459again:
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000460 assert(BufferPtr < CommentEnd);
461
462 // FIXME: It would be better to scan the text once, finding either the block
463 // end command or newline.
464 //
465 // Extract current line.
466 const char *Newline = findNewline(BufferPtr, CommentEnd);
467 StringRef Line(BufferPtr, Newline - BufferPtr);
468
469 // Look for end command in current line.
470 size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000471 const char *TextEnd;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000472 const char *NextLine;
473 if (Pos == StringRef::npos) {
474 // Current line is completely verbatim.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000475 TextEnd = Newline;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000476 NextLine = skipNewline(Newline, CommentEnd);
477 } else if (Pos == 0) {
478 // Current line contains just an end command.
479 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000480 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000481 formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000482 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000483 State = LS_Normal;
484 return;
485 } else {
486 // There is some text, followed by end command. Extract text first.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000487 TextEnd = BufferPtr + Pos;
488 NextLine = TextEnd;
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000489 // If there is only whitespace before end command, skip whitespace.
490 if (isWhitespace(BufferPtr, TextEnd)) {
491 BufferPtr = TextEnd;
492 goto again;
493 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000494 }
495
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000496 StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000497 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000498 T.setVerbatimBlockText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000499
500 State = LS_VerbatimBlockBody;
501}
502
503void Lexer::lexVerbatimBlockBody(Token &T) {
504 assert(State == LS_VerbatimBlockBody);
505
506 if (CommentState == LCS_InsideCComment)
507 skipLineStartingDecorations();
508
509 lexVerbatimBlockFirstLine(T);
510}
511
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000512void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
513 const CommandInfo *Info) {
514 assert(Info->IsVerbatimLineCommand);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000515 formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000516 T.setVerbatimLineID(Info->getID());
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000517
518 State = LS_VerbatimLineText;
519}
520
521void Lexer::lexVerbatimLineText(Token &T) {
522 assert(State == LS_VerbatimLineText);
523
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000524 // Extract current line.
525 const char *Newline = findNewline(BufferPtr, CommentEnd);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000526 const StringRef Text(BufferPtr, Newline - BufferPtr);
527 formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000528 T.setVerbatimLineText(Text);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000529
530 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000531}
532
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000533void Lexer::lexHTMLCharacterReference(Token &T) {
534 const char *TokenPtr = BufferPtr;
535 assert(*TokenPtr == '&');
536 TokenPtr++;
537 if (TokenPtr == CommentEnd) {
538 formTextToken(T, TokenPtr);
539 return;
540 }
541 const char *NamePtr;
542 bool isNamed = false;
543 bool isDecimal = false;
544 char C = *TokenPtr;
545 if (isHTMLNamedCharacterReferenceCharacter(C)) {
546 NamePtr = TokenPtr;
547 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
548 isNamed = true;
549 } else if (C == '#') {
550 TokenPtr++;
551 if (TokenPtr == CommentEnd) {
552 formTextToken(T, TokenPtr);
553 return;
554 }
555 C = *TokenPtr;
556 if (isHTMLDecimalCharacterReferenceCharacter(C)) {
557 NamePtr = TokenPtr;
558 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
559 isDecimal = true;
560 } else if (C == 'x' || C == 'X') {
561 TokenPtr++;
562 NamePtr = TokenPtr;
563 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
564 } else {
565 formTextToken(T, TokenPtr);
566 return;
567 }
568 } else {
569 formTextToken(T, TokenPtr);
570 return;
571 }
572 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
573 *TokenPtr != ';') {
574 formTextToken(T, TokenPtr);
575 return;
576 }
577 StringRef Name(NamePtr, TokenPtr - NamePtr);
578 TokenPtr++; // Skip semicolon.
579 StringRef Resolved;
580 if (isNamed)
581 Resolved = resolveHTMLNamedCharacterReference(Name);
582 else if (isDecimal)
583 Resolved = resolveHTMLDecimalCharacterReference(Name);
584 else
585 Resolved = resolveHTMLHexCharacterReference(Name);
586
587 if (Resolved.empty()) {
588 formTextToken(T, TokenPtr);
589 return;
590 }
591 formTokenWithChars(T, TokenPtr, tok::text);
592 T.setText(Resolved);
593 return;
594}
595
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000596void Lexer::setupAndLexHTMLStartTag(Token &T) {
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000597 assert(BufferPtr[0] == '<' &&
598 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000599 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000600 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000601 if (!isHTMLTagName(Name)) {
602 formTextToken(T, TagNameEnd);
603 return;
604 }
605
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000606 formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
607 T.setHTMLTagStartName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000608
609 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
610
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000611 const char C = *BufferPtr;
612 if (BufferPtr != CommentEnd &&
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000613 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000614 State = LS_HTMLStartTag;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000615}
616
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000617void Lexer::lexHTMLStartTag(Token &T) {
618 assert(State == LS_HTMLStartTag);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000619
620 const char *TokenPtr = BufferPtr;
621 char C = *TokenPtr;
622 if (isHTMLIdentifierCharacter(C)) {
623 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000624 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000625 formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000626 T.setHTMLIdent(Ident);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000627 } else {
628 switch (C) {
629 case '=':
630 TokenPtr++;
631 formTokenWithChars(T, TokenPtr, tok::html_equals);
632 break;
633 case '\"':
634 case '\'': {
635 const char *OpenQuote = TokenPtr;
636 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
637 const char *ClosingQuote = TokenPtr;
638 if (TokenPtr != CommentEnd) // Skip closing quote.
639 TokenPtr++;
640 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
641 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
642 ClosingQuote - (OpenQuote + 1)));
643 break;
644 }
645 case '>':
646 TokenPtr++;
647 formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000648 State = LS_Normal;
649 return;
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000650 case '/':
651 TokenPtr++;
652 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
653 TokenPtr++;
654 formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000655 } else
656 formTextToken(T, TokenPtr);
657
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000658 State = LS_Normal;
659 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000660 }
661 }
662
663 // Now look ahead and return to normal state if we don't see any HTML tokens
664 // ahead.
665 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
666 if (BufferPtr == CommentEnd) {
667 State = LS_Normal;
668 return;
669 }
670
671 C = *BufferPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000672 if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000673 C != '=' && C != '\"' && C != '\'' && C != '>') {
674 State = LS_Normal;
675 return;
676 }
677}
678
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000679void Lexer::setupAndLexHTMLEndTag(Token &T) {
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000680 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
681
682 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
683 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000684 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
685 if (!isHTMLTagName(Name)) {
686 formTextToken(T, TagNameEnd);
687 return;
688 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000689
690 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000691
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000692 formTokenWithChars(T, End, tok::html_end_tag);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000693 T.setHTMLTagEndName(Name);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000694
695 if (BufferPtr != CommentEnd && *BufferPtr == '>')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000696 State = LS_HTMLEndTag;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000697}
698
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000699void Lexer::lexHTMLEndTag(Token &T) {
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000700 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
701
702 formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
703 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000704}
705
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000706Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
Dmitri Gribenkoaf503a62012-08-31 10:35:30 +0000707 SourceLocation FileLoc,
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000708 const char *BufferStart, const char *BufferEnd):
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000709 Allocator(Allocator), Traits(Traits),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000710 BufferStart(BufferStart), BufferEnd(BufferEnd),
Dmitri Gribenkoaf503a62012-08-31 10:35:30 +0000711 FileLoc(FileLoc), BufferPtr(BufferStart),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000712 CommentState(LCS_BeforeComment), State(LS_Normal) {
713}
714
715void Lexer::lex(Token &T) {
716again:
717 switch (CommentState) {
718 case LCS_BeforeComment:
719 if (BufferPtr == BufferEnd) {
720 formTokenWithChars(T, BufferPtr, tok::eof);
721 return;
722 }
723
724 assert(*BufferPtr == '/');
725 BufferPtr++; // Skip first slash.
726 switch(*BufferPtr) {
727 case '/': { // BCPL comment.
728 BufferPtr++; // Skip second slash.
729
730 if (BufferPtr != BufferEnd) {
731 // Skip Doxygen magic marker, if it is present.
732 // It might be missing because of a typo //< or /*<, or because we
733 // merged this non-Doxygen comment into a bunch of Doxygen comments
734 // around it: /** ... */ /* ... */ /** ... */
735 const char C = *BufferPtr;
736 if (C == '/' || C == '!')
737 BufferPtr++;
738 }
739
740 // Skip less-than symbol that marks trailing comments.
741 // Skip it even if the comment is not a Doxygen one, because //< and /*<
742 // are frequent typos.
743 if (BufferPtr != BufferEnd && *BufferPtr == '<')
744 BufferPtr++;
745
746 CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000747 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
748 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000749 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
750 goto again;
751 }
752 case '*': { // C comment.
753 BufferPtr++; // Skip star.
754
755 // Skip Doxygen magic marker.
756 const char C = *BufferPtr;
757 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
758 BufferPtr++;
759
760 // Skip less-than symbol that marks trailing comments.
761 if (BufferPtr != BufferEnd && *BufferPtr == '<')
762 BufferPtr++;
763
764 CommentState = LCS_InsideCComment;
765 State = LS_Normal;
766 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
767 goto again;
768 }
769 default:
770 llvm_unreachable("second character of comment should be '/' or '*'");
771 }
772
773 case LCS_BetweenComments: {
774 // Consecutive comments are extracted only if there is only whitespace
775 // between them. So we can search for the start of the next comment.
776 const char *EndWhitespace = BufferPtr;
777 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
778 EndWhitespace++;
779
780 // Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000781 // between them -- guaranteed by comment extraction) into a newline. We
782 // have two newlines between C comments in total (first one was synthesized
783 // after a comment).
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000784 formTokenWithChars(T, EndWhitespace, tok::newline);
785
786 CommentState = LCS_BeforeComment;
787 break;
788 }
789
790 case LCS_InsideBCPLComment:
791 case LCS_InsideCComment:
792 if (BufferPtr != CommentEnd) {
793 lexCommentText(T);
794 break;
795 } else {
796 // Skip C comment closing sequence.
797 if (CommentState == LCS_InsideCComment) {
798 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
799 BufferPtr += 2;
800 assert(BufferPtr <= BufferEnd);
801
802 // Synthenize newline just after the C comment, regardless if there is
803 // actually a newline.
804 formTokenWithChars(T, BufferPtr, tok::newline);
805
806 CommentState = LCS_BetweenComments;
807 break;
808 } else {
809 // Don't synthesized a newline after BCPL comment.
810 CommentState = LCS_BetweenComments;
811 goto again;
812 }
813 }
814 }
815}
816
817StringRef Lexer::getSpelling(const Token &Tok,
818 const SourceManager &SourceMgr,
819 bool *Invalid) const {
820 SourceLocation Loc = Tok.getLocation();
821 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
822
823 bool InvalidTemp = false;
824 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
825 if (InvalidTemp) {
826 *Invalid = true;
827 return StringRef();
828 }
829
830 const char *Begin = File.data() + LocInfo.second;
831 return StringRef(Begin, Tok.getLength());
832}
833
} // end namespace comments
} // end namespace clang
