blob: b92b1fb33dc55506378175a74ad81449c2c22edc [file] [log] [blame]
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00001#include "clang/AST/CommentLexer.h"
Dmitri Gribenkoaa580812012-08-09 00:03:17 +00002#include "clang/AST/CommentCommandTraits.h"
Dmitri Gribenkoc934dfe2013-01-19 22:06:05 +00003#include "llvm/ADT/StringExtras.h"
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00004#include "llvm/ADT/StringSwitch.h"
Dmitri Gribenkocb5620c2013-01-30 12:06:08 +00005#include "llvm/Support/ConvertUTF.h"
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00006#include "llvm/Support/ErrorHandling.h"
7
8namespace clang {
9namespace comments {
10
11void Token::dump(const Lexer &L, const SourceManager &SM) const {
12 llvm::errs() << "comments::Token Kind=" << Kind << " ";
13 Loc.dump(SM);
14 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
15}
16
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000017namespace {
/// Returns true if \p C may appear in the name of a named HTML character
/// reference, i.e. if it is an ASCII letter.
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  return IsLower || IsUpper;
}
22
/// Returns true if \p C is an ASCII decimal digit, the only kind of character
/// allowed in a decimal character reference ("&#NNN;").
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  if (C < '0')
    return false;
  return C <= '9';
}
26
/// Returns true if \p C is an ASCII hexadecimal digit (0-9, a-f, A-F), the
/// only kind of character allowed in a hex character reference ("&#xNNN;").
bool isHTMLHexCharacterReferenceCharacter(char C) {
  if ('0' <= C && C <= '9')
    return true;
  if ('a' <= C && C <= 'f')
    return true;
  return 'A' <= C && C <= 'F';
}
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +000032
Dmitri Gribenkoc24a76e2012-08-31 02:21:44 +000033#include "clang/AST/CommentHTMLTags.inc"
34
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000035} // unnamed namespace
36
Fariborz Jahanian658a1152013-01-29 23:42:26 +000037static unsigned getCodePoint(StringRef Name) {
38 unsigned CodePoint = 0;
39 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
40 CodePoint *= 16;
41 const char C = Name[i];
42 assert(isHTMLHexCharacterReferenceCharacter(C));
43 CodePoint += llvm::hexDigitValue(C);
44 }
45 return CodePoint;
46}
47
48StringRef Lexer::helperResolveHTMLHexCharacterReference(unsigned CodePoint) const {
49 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
50 char *ResolvedPtr = Resolved;
Dmitri Gribenkocb5620c2013-01-30 12:06:08 +000051 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
Fariborz Jahanian658a1152013-01-29 23:42:26 +000052 return StringRef(Resolved, ResolvedPtr - Resolved);
53 else
54 return StringRef();
55}
56
57StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
58 unsigned CodePoint = getCodePoint(Name);
59 return helperResolveHTMLHexCharacterReference(CodePoint);
60}
61
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000062StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
63 return llvm::StringSwitch<StringRef>(Name)
64 .Case("amp", "&")
65 .Case("lt", "<")
66 .Case("gt", ">")
67 .Case("quot", "\"")
68 .Case("apos", "\'")
69 .Default("");
70}
Fariborz Jahanian658a1152013-01-29 23:42:26 +000071
72StringRef Lexer::HTMLDoxygenCharacterReference(StringRef Name) const {
73 return llvm::StringSwitch<StringRef>(Name)
74 .Case("copy", helperResolveHTMLHexCharacterReference(0x000A9))
75 .Case("trade", helperResolveHTMLHexCharacterReference(0x02122))
76 .Case("reg", helperResolveHTMLHexCharacterReference(0x000AE))
77 .Case("lt", helperResolveHTMLHexCharacterReference(0x0003C))
78 .Case("gt", helperResolveHTMLHexCharacterReference(0x0003C))
79 .Case("amp", helperResolveHTMLHexCharacterReference(0x00026))
80 .Case("apos", helperResolveHTMLHexCharacterReference(0x00027))
81 .Case("quot", helperResolveHTMLHexCharacterReference(0x00022))
82 .Case("lsquo", helperResolveHTMLHexCharacterReference(0x02018))
83 .Case("rsquo", helperResolveHTMLHexCharacterReference(0x02019))
84 .Case("ldquo", helperResolveHTMLHexCharacterReference(0x0201C))
85 .Case("rdquo", helperResolveHTMLHexCharacterReference(0x0201D))
86 .Case("ndash", helperResolveHTMLHexCharacterReference(0x02013))
87 .Case("mdash", helperResolveHTMLHexCharacterReference(0x02014))
88 .Case("Auml", helperResolveHTMLHexCharacterReference(0x000C4))
89 .Case("Euml", helperResolveHTMLHexCharacterReference(0x000CB))
90 .Case("Iuml", helperResolveHTMLHexCharacterReference(0x000CF))
91 .Case("Ouml", helperResolveHTMLHexCharacterReference(0x000D6))
92 .Case("Uuml", helperResolveHTMLHexCharacterReference(0x000DC))
93 .Case("Yuml", helperResolveHTMLHexCharacterReference(0x00178))
94 .Case("auml", helperResolveHTMLHexCharacterReference(0x000E4))
95 .Case("euml", helperResolveHTMLHexCharacterReference(0x000EB))
96 .Case("iuml", helperResolveHTMLHexCharacterReference(0x000EF))
97 .Case("ouml", helperResolveHTMLHexCharacterReference(0x000F6))
98 .Case("uuml", helperResolveHTMLHexCharacterReference(0x000FC))
99 .Case("yuml", helperResolveHTMLHexCharacterReference(0x000FF))
100 .Case("Aacute", helperResolveHTMLHexCharacterReference(0x000C1))
101 .Case("Eacute", helperResolveHTMLHexCharacterReference(0x000C9))
102 .Case("Iacute", helperResolveHTMLHexCharacterReference(0x000CD))
103 .Case("Oacute", helperResolveHTMLHexCharacterReference(0x000D3))
104 .Case("Uacute", helperResolveHTMLHexCharacterReference(0x000DA))
105 .Case("Yacute", helperResolveHTMLHexCharacterReference(0x000DD))
106 .Case("aacute", helperResolveHTMLHexCharacterReference(0x000E1))
107 .Case("eacute", helperResolveHTMLHexCharacterReference(0x000E9))
108 .Case("iacute", helperResolveHTMLHexCharacterReference(0x000ED))
109 .Case("oacute", helperResolveHTMLHexCharacterReference(0x000F3))
110 .Case("uacute", helperResolveHTMLHexCharacterReference(0x000FA))
111 .Case("yacute", helperResolveHTMLHexCharacterReference(0x000FD))
112 .Case("Agrave", helperResolveHTMLHexCharacterReference(0x000C0))
113 .Case("Egrave", helperResolveHTMLHexCharacterReference(0x000C8))
114 .Case("Igrave", helperResolveHTMLHexCharacterReference(0x000CC))
115 .Case("Ograve", helperResolveHTMLHexCharacterReference(0x000D2))
116 .Case("Ugrave", helperResolveHTMLHexCharacterReference(0x000D9))
117 .Case("agrave", helperResolveHTMLHexCharacterReference(0x000E0))
118 .Case("egrave", helperResolveHTMLHexCharacterReference(0x000E8))
119 .Case("igrave", helperResolveHTMLHexCharacterReference(0x000EC))
120 .Case("ograve", helperResolveHTMLHexCharacterReference(0x000F2))
121 .Case("ugrave", helperResolveHTMLHexCharacterReference(0x000F9))
122 .Case("ygrave", helperResolveHTMLHexCharacterReference(0x01EF3))
123 .Case("Acirc", helperResolveHTMLHexCharacterReference(0x000C2))
124 .Case("Ecirc", helperResolveHTMLHexCharacterReference(0x000CA))
125 .Case("Icirc", helperResolveHTMLHexCharacterReference(0x000CE))
126 .Case("Ocirc", helperResolveHTMLHexCharacterReference(0x000D4))
127 .Case("Ucirc", helperResolveHTMLHexCharacterReference(0x000DB))
128 .Case("acirc", helperResolveHTMLHexCharacterReference(0x000E2))
129 .Case("ecirc", helperResolveHTMLHexCharacterReference(0x000EA))
130 .Case("icirc", helperResolveHTMLHexCharacterReference(0x000EE))
131 .Case("ocirc", helperResolveHTMLHexCharacterReference(0x000F4))
132 .Case("ucirc", helperResolveHTMLHexCharacterReference(0x000FB))
133 .Case("ycirc", helperResolveHTMLHexCharacterReference(0x00177))
134 .Case("Atilde", helperResolveHTMLHexCharacterReference(0x000C3))
135 .Case("Ntilde", helperResolveHTMLHexCharacterReference(0x000D1))
136 .Case("Otilde", helperResolveHTMLHexCharacterReference(0x000D5))
137 .Case("atilde", helperResolveHTMLHexCharacterReference(0x000E3))
138 .Case("ntilde", helperResolveHTMLHexCharacterReference(0x000F1))
139 .Case("otilde", helperResolveHTMLHexCharacterReference(0x000F5))
140 .Case("szlig", helperResolveHTMLHexCharacterReference(0x000DF))
141 .Case("ccedil", helperResolveHTMLHexCharacterReference(0x000E7))
142 .Case("Ccedil", helperResolveHTMLHexCharacterReference(0x000C7))
143 .Case("aring", helperResolveHTMLHexCharacterReference(0x000E5))
144 .Case("Aring", helperResolveHTMLHexCharacterReference(0x000C5))
145 .Case("nbsp", helperResolveHTMLHexCharacterReference(0x000A0))
146 .Case("Gamma", helperResolveHTMLHexCharacterReference(0x00393))
147 .Case("Delta", helperResolveHTMLHexCharacterReference(0x00394))
148 .Case("Theta", helperResolveHTMLHexCharacterReference(0x00398))
149 .Case("Lambda", helperResolveHTMLHexCharacterReference(0x0039B))
150 .Case("Xi", helperResolveHTMLHexCharacterReference(0x0039E))
151 .Case("Pi", helperResolveHTMLHexCharacterReference(0x003A0))
152 .Case("Sigma", helperResolveHTMLHexCharacterReference(0x003A3))
153 .Case("Upsilon", helperResolveHTMLHexCharacterReference(0x003A5))
154 .Case("Phi", helperResolveHTMLHexCharacterReference(0x003A6))
155 .Case("Psi", helperResolveHTMLHexCharacterReference(0x003A8))
156 .Case("Omega", helperResolveHTMLHexCharacterReference(0x003A9))
157 .Case("alpha", helperResolveHTMLHexCharacterReference(0x003B1))
158 .Case("beta", helperResolveHTMLHexCharacterReference(0x003B2))
159 .Case("gamma", helperResolveHTMLHexCharacterReference(0x003B3))
160 .Case("delta", helperResolveHTMLHexCharacterReference(0x003B4))
161 .Case("epsilon", helperResolveHTMLHexCharacterReference(0x003B5))
162 .Case("zeta", helperResolveHTMLHexCharacterReference(0x003B6))
163 .Case("eta", helperResolveHTMLHexCharacterReference(0x003B7))
164 .Case("theta", helperResolveHTMLHexCharacterReference(0x003B8))
165 .Case("iota", helperResolveHTMLHexCharacterReference(0x003B9))
166 .Case("kappa", helperResolveHTMLHexCharacterReference(0x003BA))
167 .Case("lambda", helperResolveHTMLHexCharacterReference(0x003BB))
168 .Case("mu", helperResolveHTMLHexCharacterReference(0x003BC))
169 .Case("nu", helperResolveHTMLHexCharacterReference(0x003BD))
170 .Case("xi", helperResolveHTMLHexCharacterReference(0x003BE))
171 .Case("pi", helperResolveHTMLHexCharacterReference(0x003C0))
172 .Case("rho", helperResolveHTMLHexCharacterReference(0x003C1))
173 .Case("sigma", helperResolveHTMLHexCharacterReference(0x003C3))
174 .Case("tau", helperResolveHTMLHexCharacterReference(0x003C4))
175 .Case("upsilon", helperResolveHTMLHexCharacterReference(0x003C5))
176 .Case("phi", helperResolveHTMLHexCharacterReference(0x003C6))
177 .Case("chi", helperResolveHTMLHexCharacterReference(0x003C7))
178 .Case("psi", helperResolveHTMLHexCharacterReference(0x003C8))
179 .Case("omega", helperResolveHTMLHexCharacterReference(0x003C9))
180 .Case("sigmaf", helperResolveHTMLHexCharacterReference(0x003C2))
181 .Case("sect", helperResolveHTMLHexCharacterReference(0x000A7))
182 .Case("deg", helperResolveHTMLHexCharacterReference(0x000B0))
183 .Case("prime", helperResolveHTMLHexCharacterReference(0x02032))
184 .Case("Prime", helperResolveHTMLHexCharacterReference(0x02033))
185 .Case("infin", helperResolveHTMLHexCharacterReference(0x0221E))
186 .Case("empty", helperResolveHTMLHexCharacterReference(0x02205))
187 .Case("plusmn", helperResolveHTMLHexCharacterReference(0x000B1))
188 .Case("times", helperResolveHTMLHexCharacterReference(0x000D7))
189 .Case("minus", helperResolveHTMLHexCharacterReference(0x02212))
190 .Case("sdot", helperResolveHTMLHexCharacterReference(0x022C5))
191 .Case("part", helperResolveHTMLHexCharacterReference(0x02202))
192 .Case("nabla", helperResolveHTMLHexCharacterReference(0x02207))
193 .Case("radic", helperResolveHTMLHexCharacterReference(0x0221A))
194 .Case("perp", helperResolveHTMLHexCharacterReference(0x022A5))
195 .Case("sum", helperResolveHTMLHexCharacterReference(0x02211))
196 .Case("int", helperResolveHTMLHexCharacterReference(0x0222B))
197 .Case("prod", helperResolveHTMLHexCharacterReference(0x0220F))
198 .Case("sim", helperResolveHTMLHexCharacterReference(0x0223C))
199 .Case("asymp", helperResolveHTMLHexCharacterReference(0x02248))
200 .Case("ne", helperResolveHTMLHexCharacterReference(0x02260))
201 .Case("equiv", helperResolveHTMLHexCharacterReference(0x02261))
202 .Case("prop", helperResolveHTMLHexCharacterReference(0x0221D))
203 .Case("le", helperResolveHTMLHexCharacterReference(0x02264))
204 .Case("ge", helperResolveHTMLHexCharacterReference(0x02265))
205 .Case("larr", helperResolveHTMLHexCharacterReference(0x02190))
206 .Case("rarr", helperResolveHTMLHexCharacterReference(0x02192))
207 .Case("isin", helperResolveHTMLHexCharacterReference(0x02208))
208 .Case("notin", helperResolveHTMLHexCharacterReference(0x02209))
209 .Case("lceil", helperResolveHTMLHexCharacterReference(0x02308))
210 .Case("rceil", helperResolveHTMLHexCharacterReference(0x02309))
211 .Case("lfloor", helperResolveHTMLHexCharacterReference(0x0230A))
212 .Case("rfloor", helperResolveHTMLHexCharacterReference(0x0230B))
213 .Default("");
214}
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000215
216StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
217 unsigned CodePoint = 0;
218 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
219 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
220 CodePoint *= 10;
221 CodePoint += Name[i] - '0';
222 }
223
224 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
225 char *ResolvedPtr = Resolved;
Dmitri Gribenkocb5620c2013-01-30 12:06:08 +0000226 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000227 return StringRef(Resolved, ResolvedPtr - Resolved);
228 else
229 return StringRef();
230}
231
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000232void Lexer::skipLineStartingDecorations() {
233 // This function should be called only for C comments
234 assert(CommentState == LCS_InsideCComment);
235
236 if (BufferPtr == CommentEnd)
237 return;
238
239 switch (*BufferPtr) {
240 case ' ':
241 case '\t':
242 case '\f':
243 case '\v': {
244 const char *NewBufferPtr = BufferPtr;
245 NewBufferPtr++;
246 if (NewBufferPtr == CommentEnd)
247 return;
248
249 char C = *NewBufferPtr;
250 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
251 NewBufferPtr++;
252 if (NewBufferPtr == CommentEnd)
253 return;
254 C = *NewBufferPtr;
255 }
256 if (C == '*')
257 BufferPtr = NewBufferPtr + 1;
258 break;
259 }
260 case '*':
261 BufferPtr++;
262 break;
263 }
264}
265
266namespace {
/// Returns a pointer to the first '\n' or '\r' in [BufferPtr, BufferEnd), or
/// \p BufferEnd if the range contains neither.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd) {
    if (*Cursor == '\n' || *Cursor == '\r')
      break;
    ++Cursor;
  }
  return Cursor;
}
276
/// Skip one newline sequence ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd (nothing is skipped) or point
/// at '\n' or '\r' (asserted).
///
/// \returns a pointer just past the newline sequence.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single newline.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
291
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000292const char *skipNamedCharacterReference(const char *BufferPtr,
293 const char *BufferEnd) {
294 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
295 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
296 return BufferPtr;
297 }
298 return BufferEnd;
299}
300
301const char *skipDecimalCharacterReference(const char *BufferPtr,
302 const char *BufferEnd) {
303 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
304 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
305 return BufferPtr;
306 }
307 return BufferEnd;
308}
309
310const char *skipHexCharacterReference(const char *BufferPtr,
311 const char *BufferEnd) {
312 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
313 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
314 return BufferPtr;
315 }
316 return BufferEnd;
317}
318
/// Returns true if \p C may begin an HTML identifier, i.e. if it is an ASCII
/// letter.
bool isHTMLIdentifierStartingCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  return 'A' <= C && C <= 'Z';
}
323
/// Returns true if \p C may appear inside an HTML identifier, i.e. if it is
/// an ASCII letter or decimal digit.
bool isHTMLIdentifierCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  const bool IsDigit = '0' <= C && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
329
330const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
331 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
332 if (!isHTMLIdentifierCharacter(*BufferPtr))
333 return BufferPtr;
334 }
335 return BufferEnd;
336}
337
/// Skip an HTML string enclosed in single or double quotes.  \p BufferPtr
/// must point at the opening quote (asserted).  A quote character that is
/// immediately preceded by a backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    if (*Cursor == Quote && *(Cursor - 1) != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
355
/// Returns true if \p C is a space, horizontal tab, form feed or vertical
/// tab.  Newline characters are not included.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
359
/// Returns true if \p C is a space, newline, carriage return, horizontal
/// tab, form feed or vertical tab.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
364
365const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
366 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
367 if (!isWhitespace(*BufferPtr))
368 return BufferPtr;
369 }
370 return BufferEnd;
371}
372
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000373bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
374 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
375}
376
/// Returns true if \p C may begin a command name, i.e. if it is an ASCII
/// letter.
bool isCommandNameStartCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  return 'A' <= C && C <= 'Z';
}
381
/// Returns true if \p C may appear inside a command name, i.e. if it is an
/// ASCII letter or decimal digit.
bool isCommandNameCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  const bool IsDigit = '0' <= C && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
387
388const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
389 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
390 if (!isCommandNameCharacter(*BufferPtr))
391 return BufferPtr;
392 }
393 return BufferEnd;
394}
395
396/// Return the one past end pointer for BCPL comments.
397/// Handles newlines escaped with backslash or trigraph for backslahs.
398const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
399 const char *CurPtr = BufferPtr;
400 while (CurPtr != BufferEnd) {
401 char C = *CurPtr;
402 while (C != '\n' && C != '\r') {
403 CurPtr++;
404 if (CurPtr == BufferEnd)
405 return BufferEnd;
406 C = *CurPtr;
407 }
408 // We found a newline, check if it is escaped.
409 const char *EscapePtr = CurPtr - 1;
410 while(isHorizontalWhitespace(*EscapePtr))
411 EscapePtr--;
412
413 if (*EscapePtr == '\\' ||
414 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
415 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
416 // We found an escaped newline.
417 CurPtr = skipNewline(CurPtr, BufferEnd);
418 } else
419 return CurPtr; // Not an escaped newline.
420 }
421 return BufferEnd;
422}
423
424/// Return the one past end pointer for C comments.
425/// Very dumb, does not handle escaped newlines or trigraphs.
426const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
427 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
428 if (*BufferPtr == '*') {
429 assert(BufferPtr + 1 != BufferEnd);
430 if (*(BufferPtr + 1) == '/')
431 return BufferPtr;
432 }
433 }
434 llvm_unreachable("buffer end hit before '*/' was seen");
435}
436} // unnamed namespace
437
438void Lexer::lexCommentText(Token &T) {
439 assert(CommentState == LCS_InsideBCPLComment ||
440 CommentState == LCS_InsideCComment);
441
442 switch (State) {
443 case LS_Normal:
444 break;
445 case LS_VerbatimBlockFirstLine:
446 lexVerbatimBlockFirstLine(T);
447 return;
448 case LS_VerbatimBlockBody:
449 lexVerbatimBlockBody(T);
450 return;
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000451 case LS_VerbatimLineText:
452 lexVerbatimLineText(T);
453 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000454 case LS_HTMLStartTag:
455 lexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000456 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000457 case LS_HTMLEndTag:
458 lexHTMLEndTag(T);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000459 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000460 }
461
462 assert(State == LS_Normal);
463
464 const char *TokenPtr = BufferPtr;
465 assert(TokenPtr < CommentEnd);
466 while (TokenPtr != CommentEnd) {
467 switch(*TokenPtr) {
468 case '\\':
469 case '@': {
470 TokenPtr++;
471 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000472 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000473 return;
474 }
475 char C = *TokenPtr;
476 switch (C) {
477 default:
478 break;
479
480 case '\\': case '@': case '&': case '$':
481 case '#': case '<': case '>': case '%':
482 case '\"': case '.': case ':':
483 // This is one of \\ \@ \& \$ etc escape sequences.
484 TokenPtr++;
485 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
486 // This is the \:: escape sequence.
487 TokenPtr++;
488 }
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000489 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000490 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000491 T.setText(UnescapedText);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000492 return;
493 }
494
495 // Don't make zero-length commands.
Dmitri Gribenko8c05da32012-09-14 16:35:35 +0000496 if (!isCommandNameStartCharacter(*TokenPtr)) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000497 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000498 return;
499 }
500
501 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
502 unsigned Length = TokenPtr - (BufferPtr + 1);
503
504 // Hardcoded support for lexing LaTeX formula commands
505 // \f$ \f[ \f] \f{ \f} as a single command.
506 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
507 C = *TokenPtr;
508 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
509 TokenPtr++;
510 Length++;
511 }
512 }
513
514 const StringRef CommandName(BufferPtr + 1, Length);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000515
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000516 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
517 if (!Info) {
518 formTokenWithChars(T, TokenPtr, tok::unknown_command);
519 T.setUnknownCommandName(CommandName);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000520 return;
521 }
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000522 if (Info->IsVerbatimBlockCommand) {
523 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
524 return;
525 }
526 if (Info->IsVerbatimLineCommand) {
527 setupAndLexVerbatimLine(T, TokenPtr, Info);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000528 return;
529 }
530 formTokenWithChars(T, TokenPtr, tok::command);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000531 T.setCommandID(Info->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000532 return;
533 }
534
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000535 case '&':
536 lexHTMLCharacterReference(T);
537 return;
538
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000539 case '<': {
540 TokenPtr++;
541 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000542 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000543 return;
544 }
545 const char C = *TokenPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000546 if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000547 setupAndLexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000548 else if (C == '/')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000549 setupAndLexHTMLEndTag(T);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000550 else
551 formTextToken(T, TokenPtr);
552
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000553 return;
554 }
555
556 case '\n':
557 case '\r':
558 TokenPtr = skipNewline(TokenPtr, CommentEnd);
559 formTokenWithChars(T, TokenPtr, tok::newline);
560
561 if (CommentState == LCS_InsideCComment)
562 skipLineStartingDecorations();
563 return;
564
565 default: {
Dmitri Gribenkoaa7dbaf2012-12-30 19:45:46 +0000566 size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
567 find_first_of("\n\r\\@&<");
568 if (End != StringRef::npos)
569 TokenPtr += End;
570 else
571 TokenPtr = CommentEnd;
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000572 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000573 return;
574 }
575 }
576 }
577}
578
579void Lexer::setupAndLexVerbatimBlock(Token &T,
580 const char *TextBegin,
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000581 char Marker, const CommandInfo *Info) {
582 assert(Info->IsVerbatimBlockCommand);
583
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000584 VerbatimBlockEndCommandName.clear();
585 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000586 VerbatimBlockEndCommandName.append(Info->EndCommandName);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000587
588 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000589 T.setVerbatimBlockID(Info->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000590
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000591 // If there is a newline following the verbatim opening command, skip the
592 // newline so that we don't create an tok::verbatim_block_line with empty
593 // text content.
594 if (BufferPtr != CommentEnd) {
595 const char C = *BufferPtr;
596 if (C == '\n' || C == '\r') {
597 BufferPtr = skipNewline(BufferPtr, CommentEnd);
598 State = LS_VerbatimBlockBody;
599 return;
600 }
601 }
602
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000603 State = LS_VerbatimBlockFirstLine;
604}
605
606void Lexer::lexVerbatimBlockFirstLine(Token &T) {
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000607again:
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000608 assert(BufferPtr < CommentEnd);
609
610 // FIXME: It would be better to scan the text once, finding either the block
611 // end command or newline.
612 //
613 // Extract current line.
614 const char *Newline = findNewline(BufferPtr, CommentEnd);
615 StringRef Line(BufferPtr, Newline - BufferPtr);
616
617 // Look for end command in current line.
618 size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000619 const char *TextEnd;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000620 const char *NextLine;
621 if (Pos == StringRef::npos) {
622 // Current line is completely verbatim.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000623 TextEnd = Newline;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000624 NextLine = skipNewline(Newline, CommentEnd);
625 } else if (Pos == 0) {
626 // Current line contains just an end command.
627 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000628 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000629 formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000630 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000631 State = LS_Normal;
632 return;
633 } else {
634 // There is some text, followed by end command. Extract text first.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000635 TextEnd = BufferPtr + Pos;
636 NextLine = TextEnd;
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000637 // If there is only whitespace before end command, skip whitespace.
638 if (isWhitespace(BufferPtr, TextEnd)) {
639 BufferPtr = TextEnd;
640 goto again;
641 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000642 }
643
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000644 StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000645 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000646 T.setVerbatimBlockText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000647
648 State = LS_VerbatimBlockBody;
649}
650
651void Lexer::lexVerbatimBlockBody(Token &T) {
652 assert(State == LS_VerbatimBlockBody);
653
654 if (CommentState == LCS_InsideCComment)
655 skipLineStartingDecorations();
656
657 lexVerbatimBlockFirstLine(T);
658}
659
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000660void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
661 const CommandInfo *Info) {
662 assert(Info->IsVerbatimLineCommand);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000663 formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000664 T.setVerbatimLineID(Info->getID());
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000665
666 State = LS_VerbatimLineText;
667}
668
669void Lexer::lexVerbatimLineText(Token &T) {
670 assert(State == LS_VerbatimLineText);
671
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000672 // Extract current line.
673 const char *Newline = findNewline(BufferPtr, CommentEnd);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000674 const StringRef Text(BufferPtr, Newline - BufferPtr);
675 formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000676 T.setVerbatimLineText(Text);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000677
678 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000679}
680
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000681void Lexer::lexHTMLCharacterReference(Token &T) {
682 const char *TokenPtr = BufferPtr;
683 assert(*TokenPtr == '&');
684 TokenPtr++;
685 if (TokenPtr == CommentEnd) {
686 formTextToken(T, TokenPtr);
687 return;
688 }
689 const char *NamePtr;
690 bool isNamed = false;
691 bool isDecimal = false;
692 char C = *TokenPtr;
693 if (isHTMLNamedCharacterReferenceCharacter(C)) {
694 NamePtr = TokenPtr;
695 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
696 isNamed = true;
697 } else if (C == '#') {
698 TokenPtr++;
699 if (TokenPtr == CommentEnd) {
700 formTextToken(T, TokenPtr);
701 return;
702 }
703 C = *TokenPtr;
704 if (isHTMLDecimalCharacterReferenceCharacter(C)) {
705 NamePtr = TokenPtr;
706 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
707 isDecimal = true;
708 } else if (C == 'x' || C == 'X') {
709 TokenPtr++;
710 NamePtr = TokenPtr;
711 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
712 } else {
713 formTextToken(T, TokenPtr);
714 return;
715 }
716 } else {
717 formTextToken(T, TokenPtr);
718 return;
719 }
720 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
721 *TokenPtr != ';') {
722 formTextToken(T, TokenPtr);
723 return;
724 }
725 StringRef Name(NamePtr, TokenPtr - NamePtr);
726 TokenPtr++; // Skip semicolon.
727 StringRef Resolved;
Fariborz Jahanian658a1152013-01-29 23:42:26 +0000728 if (isNamed) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000729 Resolved = resolveHTMLNamedCharacterReference(Name);
Fariborz Jahanian658a1152013-01-29 23:42:26 +0000730 if (Resolved.empty()) {
731 Resolved = HTMLDoxygenCharacterReference(Name);
732 if (!Resolved.empty()) {
733 formTokenWithChars(T, TokenPtr, tok::text);
734 T.setText(Resolved);
735 return;
736 }
737 }
738 }
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000739 else if (isDecimal)
740 Resolved = resolveHTMLDecimalCharacterReference(Name);
741 else
742 Resolved = resolveHTMLHexCharacterReference(Name);
743
744 if (Resolved.empty()) {
745 formTextToken(T, TokenPtr);
746 return;
747 }
748 formTokenWithChars(T, TokenPtr, tok::text);
749 T.setText(Resolved);
750 return;
751}
752
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000753void Lexer::setupAndLexHTMLStartTag(Token &T) {
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000754 assert(BufferPtr[0] == '<' &&
755 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000756 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000757 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000758 if (!isHTMLTagName(Name)) {
759 formTextToken(T, TagNameEnd);
760 return;
761 }
762
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000763 formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
764 T.setHTMLTagStartName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000765
766 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
767
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000768 const char C = *BufferPtr;
769 if (BufferPtr != CommentEnd &&
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000770 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000771 State = LS_HTMLStartTag;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000772}
773
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000774void Lexer::lexHTMLStartTag(Token &T) {
775 assert(State == LS_HTMLStartTag);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000776
777 const char *TokenPtr = BufferPtr;
778 char C = *TokenPtr;
779 if (isHTMLIdentifierCharacter(C)) {
780 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000781 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000782 formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000783 T.setHTMLIdent(Ident);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000784 } else {
785 switch (C) {
786 case '=':
787 TokenPtr++;
788 formTokenWithChars(T, TokenPtr, tok::html_equals);
789 break;
790 case '\"':
791 case '\'': {
792 const char *OpenQuote = TokenPtr;
793 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
794 const char *ClosingQuote = TokenPtr;
795 if (TokenPtr != CommentEnd) // Skip closing quote.
796 TokenPtr++;
797 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
798 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
799 ClosingQuote - (OpenQuote + 1)));
800 break;
801 }
802 case '>':
803 TokenPtr++;
804 formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000805 State = LS_Normal;
806 return;
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000807 case '/':
808 TokenPtr++;
809 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
810 TokenPtr++;
811 formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000812 } else
813 formTextToken(T, TokenPtr);
814
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000815 State = LS_Normal;
816 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000817 }
818 }
819
820 // Now look ahead and return to normal state if we don't see any HTML tokens
821 // ahead.
822 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
823 if (BufferPtr == CommentEnd) {
824 State = LS_Normal;
825 return;
826 }
827
828 C = *BufferPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000829 if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000830 C != '=' && C != '\"' && C != '\'' && C != '>') {
831 State = LS_Normal;
832 return;
833 }
834}
835
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000836void Lexer::setupAndLexHTMLEndTag(Token &T) {
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000837 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
838
839 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
840 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000841 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
842 if (!isHTMLTagName(Name)) {
843 formTextToken(T, TagNameEnd);
844 return;
845 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000846
847 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000848
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000849 formTokenWithChars(T, End, tok::html_end_tag);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000850 T.setHTMLTagEndName(Name);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000851
852 if (BufferPtr != CommentEnd && *BufferPtr == '>')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000853 State = LS_HTMLEndTag;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000854}
855
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000856void Lexer::lexHTMLEndTag(Token &T) {
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000857 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
858
859 formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
860 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000861}
862
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000863Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
Dmitri Gribenkoaf503a62012-08-31 10:35:30 +0000864 SourceLocation FileLoc,
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000865 const char *BufferStart, const char *BufferEnd):
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000866 Allocator(Allocator), Traits(Traits),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000867 BufferStart(BufferStart), BufferEnd(BufferEnd),
Dmitri Gribenkoaf503a62012-08-31 10:35:30 +0000868 FileLoc(FileLoc), BufferPtr(BufferStart),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000869 CommentState(LCS_BeforeComment), State(LS_Normal) {
870}
871
872void Lexer::lex(Token &T) {
873again:
874 switch (CommentState) {
875 case LCS_BeforeComment:
876 if (BufferPtr == BufferEnd) {
877 formTokenWithChars(T, BufferPtr, tok::eof);
878 return;
879 }
880
881 assert(*BufferPtr == '/');
882 BufferPtr++; // Skip first slash.
883 switch(*BufferPtr) {
884 case '/': { // BCPL comment.
885 BufferPtr++; // Skip second slash.
886
887 if (BufferPtr != BufferEnd) {
888 // Skip Doxygen magic marker, if it is present.
889 // It might be missing because of a typo //< or /*<, or because we
890 // merged this non-Doxygen comment into a bunch of Doxygen comments
891 // around it: /** ... */ /* ... */ /** ... */
892 const char C = *BufferPtr;
893 if (C == '/' || C == '!')
894 BufferPtr++;
895 }
896
897 // Skip less-than symbol that marks trailing comments.
898 // Skip it even if the comment is not a Doxygen one, because //< and /*<
899 // are frequent typos.
900 if (BufferPtr != BufferEnd && *BufferPtr == '<')
901 BufferPtr++;
902
903 CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000904 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
905 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000906 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
907 goto again;
908 }
909 case '*': { // C comment.
910 BufferPtr++; // Skip star.
911
912 // Skip Doxygen magic marker.
913 const char C = *BufferPtr;
914 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
915 BufferPtr++;
916
917 // Skip less-than symbol that marks trailing comments.
918 if (BufferPtr != BufferEnd && *BufferPtr == '<')
919 BufferPtr++;
920
921 CommentState = LCS_InsideCComment;
922 State = LS_Normal;
923 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
924 goto again;
925 }
926 default:
927 llvm_unreachable("second character of comment should be '/' or '*'");
928 }
929
930 case LCS_BetweenComments: {
931 // Consecutive comments are extracted only if there is only whitespace
932 // between them. So we can search for the start of the next comment.
933 const char *EndWhitespace = BufferPtr;
934 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
935 EndWhitespace++;
936
937 // Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000938 // between them -- guaranteed by comment extraction) into a newline. We
939 // have two newlines between C comments in total (first one was synthesized
940 // after a comment).
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000941 formTokenWithChars(T, EndWhitespace, tok::newline);
942
943 CommentState = LCS_BeforeComment;
944 break;
945 }
946
947 case LCS_InsideBCPLComment:
948 case LCS_InsideCComment:
949 if (BufferPtr != CommentEnd) {
950 lexCommentText(T);
951 break;
952 } else {
953 // Skip C comment closing sequence.
954 if (CommentState == LCS_InsideCComment) {
955 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
956 BufferPtr += 2;
957 assert(BufferPtr <= BufferEnd);
958
959 // Synthenize newline just after the C comment, regardless if there is
960 // actually a newline.
961 formTokenWithChars(T, BufferPtr, tok::newline);
962
963 CommentState = LCS_BetweenComments;
964 break;
965 } else {
966 // Don't synthesized a newline after BCPL comment.
967 CommentState = LCS_BetweenComments;
968 goto again;
969 }
970 }
971 }
972}
973
974StringRef Lexer::getSpelling(const Token &Tok,
975 const SourceManager &SourceMgr,
976 bool *Invalid) const {
977 SourceLocation Loc = Tok.getLocation();
978 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
979
980 bool InvalidTemp = false;
981 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
982 if (InvalidTemp) {
983 *Invalid = true;
984 return StringRef();
985 }
986
987 const char *Begin = File.data() + LocInfo.second;
988 return StringRef(Begin, Tok.getLength());
989}
990
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000991} // end namespace comments
992} // end namespace clang
993