blob: f1c23c6103e4ad5397375598f1eead10ddb34053 [file] [log] [blame]
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00001#include "clang/AST/CommentLexer.h"
Dmitri Gribenkoaa580812012-08-09 00:03:17 +00002#include "clang/AST/CommentCommandTraits.h"
Dmitri Gribenko477a9f52012-07-27 20:37:06 +00003#include "clang/Basic/ConvertUTF.h"
Dmitri Gribenkoc934dfe2013-01-19 22:06:05 +00004#include "llvm/ADT/StringExtras.h"
Dmitri Gribenko2d44d772012-06-26 20:39:18 +00005#include "llvm/ADT/StringSwitch.h"
6#include "llvm/Support/ErrorHandling.h"
7
8namespace clang {
9namespace comments {
10
11void Token::dump(const Lexer &L, const SourceManager &SM) const {
12 llvm::errs() << "comments::Token Kind=" << Kind << " ";
13 Loc.dump(SM);
14 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
15}
16
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000017namespace {
/// Returns true if \p C may appear in the name of a named HTML character
/// reference, i.e. if it is an ASCII letter.
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  return 'A' <= C && C <= 'Z';
}
22
/// Returns true if \p C is an ASCII decimal digit, the only kind of character
/// accepted in a decimal HTML character reference.
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return !(C < '0' || C > '9');
}
26
/// Returns true if \p C is an ASCII hexadecimal digit (0-9, a-f or A-F), the
/// only kind of character accepted in a hexadecimal HTML character reference.
bool isHTMLHexCharacterReferenceCharacter(char C) {
  if ('0' <= C && C <= '9')
    return true;
  if ('a' <= C && C <= 'f')
    return true;
  return 'A' <= C && C <= 'F';
}
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +000032
Dmitri Gribenkoc24a76e2012-08-31 02:21:44 +000033#include "clang/AST/CommentHTMLTags.inc"
34
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000035} // unnamed namespace
36
Fariborz Jahanian658a1152013-01-29 23:42:26 +000037static unsigned getCodePoint(StringRef Name) {
38 unsigned CodePoint = 0;
39 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
40 CodePoint *= 16;
41 const char C = Name[i];
42 assert(isHTMLHexCharacterReferenceCharacter(C));
43 CodePoint += llvm::hexDigitValue(C);
44 }
45 return CodePoint;
46}
47
48StringRef Lexer::helperResolveHTMLHexCharacterReference(unsigned CodePoint) const {
49 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
50 char *ResolvedPtr = Resolved;
51 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
52 return StringRef(Resolved, ResolvedPtr - Resolved);
53 else
54 return StringRef();
55}
56
57StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
58 unsigned CodePoint = getCodePoint(Name);
59 return helperResolveHTMLHexCharacterReference(CodePoint);
60}
61
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000062StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
63 return llvm::StringSwitch<StringRef>(Name)
64 .Case("amp", "&")
65 .Case("lt", "<")
66 .Case("gt", ">")
67 .Case("quot", "\"")
68 .Case("apos", "\'")
Fariborz Jahanian658a1152013-01-29 23:42:26 +000069 .Case("minus", "-")
70 .Case("sim", "~")
Dmitri Gribenko477a9f52012-07-27 20:37:06 +000071 .Default("");
72}
Fariborz Jahanian658a1152013-01-29 23:42:26 +000073
74StringRef Lexer::HTMLDoxygenCharacterReference(StringRef Name) const {
75 return llvm::StringSwitch<StringRef>(Name)
76 .Case("copy", helperResolveHTMLHexCharacterReference(0x000A9))
77 .Case("trade", helperResolveHTMLHexCharacterReference(0x02122))
78 .Case("reg", helperResolveHTMLHexCharacterReference(0x000AE))
79 .Case("lt", helperResolveHTMLHexCharacterReference(0x0003C))
80 .Case("gt", helperResolveHTMLHexCharacterReference(0x0003C))
81 .Case("amp", helperResolveHTMLHexCharacterReference(0x00026))
82 .Case("apos", helperResolveHTMLHexCharacterReference(0x00027))
83 .Case("quot", helperResolveHTMLHexCharacterReference(0x00022))
84 .Case("lsquo", helperResolveHTMLHexCharacterReference(0x02018))
85 .Case("rsquo", helperResolveHTMLHexCharacterReference(0x02019))
86 .Case("ldquo", helperResolveHTMLHexCharacterReference(0x0201C))
87 .Case("rdquo", helperResolveHTMLHexCharacterReference(0x0201D))
88 .Case("ndash", helperResolveHTMLHexCharacterReference(0x02013))
89 .Case("mdash", helperResolveHTMLHexCharacterReference(0x02014))
90 .Case("Auml", helperResolveHTMLHexCharacterReference(0x000C4))
91 .Case("Euml", helperResolveHTMLHexCharacterReference(0x000CB))
92 .Case("Iuml", helperResolveHTMLHexCharacterReference(0x000CF))
93 .Case("Ouml", helperResolveHTMLHexCharacterReference(0x000D6))
94 .Case("Uuml", helperResolveHTMLHexCharacterReference(0x000DC))
95 .Case("Yuml", helperResolveHTMLHexCharacterReference(0x00178))
96 .Case("auml", helperResolveHTMLHexCharacterReference(0x000E4))
97 .Case("euml", helperResolveHTMLHexCharacterReference(0x000EB))
98 .Case("iuml", helperResolveHTMLHexCharacterReference(0x000EF))
99 .Case("ouml", helperResolveHTMLHexCharacterReference(0x000F6))
100 .Case("uuml", helperResolveHTMLHexCharacterReference(0x000FC))
101 .Case("yuml", helperResolveHTMLHexCharacterReference(0x000FF))
102 .Case("Aacute", helperResolveHTMLHexCharacterReference(0x000C1))
103 .Case("Eacute", helperResolveHTMLHexCharacterReference(0x000C9))
104 .Case("Iacute", helperResolveHTMLHexCharacterReference(0x000CD))
105 .Case("Oacute", helperResolveHTMLHexCharacterReference(0x000D3))
106 .Case("Uacute", helperResolveHTMLHexCharacterReference(0x000DA))
107 .Case("Yacute", helperResolveHTMLHexCharacterReference(0x000DD))
108 .Case("aacute", helperResolveHTMLHexCharacterReference(0x000E1))
109 .Case("eacute", helperResolveHTMLHexCharacterReference(0x000E9))
110 .Case("iacute", helperResolveHTMLHexCharacterReference(0x000ED))
111 .Case("oacute", helperResolveHTMLHexCharacterReference(0x000F3))
112 .Case("uacute", helperResolveHTMLHexCharacterReference(0x000FA))
113 .Case("yacute", helperResolveHTMLHexCharacterReference(0x000FD))
114 .Case("Agrave", helperResolveHTMLHexCharacterReference(0x000C0))
115 .Case("Egrave", helperResolveHTMLHexCharacterReference(0x000C8))
116 .Case("Igrave", helperResolveHTMLHexCharacterReference(0x000CC))
117 .Case("Ograve", helperResolveHTMLHexCharacterReference(0x000D2))
118 .Case("Ugrave", helperResolveHTMLHexCharacterReference(0x000D9))
119 .Case("agrave", helperResolveHTMLHexCharacterReference(0x000E0))
120 .Case("egrave", helperResolveHTMLHexCharacterReference(0x000E8))
121 .Case("igrave", helperResolveHTMLHexCharacterReference(0x000EC))
122 .Case("ograve", helperResolveHTMLHexCharacterReference(0x000F2))
123 .Case("ugrave", helperResolveHTMLHexCharacterReference(0x000F9))
124 .Case("ygrave", helperResolveHTMLHexCharacterReference(0x01EF3))
125 .Case("Acirc", helperResolveHTMLHexCharacterReference(0x000C2))
126 .Case("Ecirc", helperResolveHTMLHexCharacterReference(0x000CA))
127 .Case("Icirc", helperResolveHTMLHexCharacterReference(0x000CE))
128 .Case("Ocirc", helperResolveHTMLHexCharacterReference(0x000D4))
129 .Case("Ucirc", helperResolveHTMLHexCharacterReference(0x000DB))
130 .Case("acirc", helperResolveHTMLHexCharacterReference(0x000E2))
131 .Case("ecirc", helperResolveHTMLHexCharacterReference(0x000EA))
132 .Case("icirc", helperResolveHTMLHexCharacterReference(0x000EE))
133 .Case("ocirc", helperResolveHTMLHexCharacterReference(0x000F4))
134 .Case("ucirc", helperResolveHTMLHexCharacterReference(0x000FB))
135 .Case("ycirc", helperResolveHTMLHexCharacterReference(0x00177))
136 .Case("Atilde", helperResolveHTMLHexCharacterReference(0x000C3))
137 .Case("Ntilde", helperResolveHTMLHexCharacterReference(0x000D1))
138 .Case("Otilde", helperResolveHTMLHexCharacterReference(0x000D5))
139 .Case("atilde", helperResolveHTMLHexCharacterReference(0x000E3))
140 .Case("ntilde", helperResolveHTMLHexCharacterReference(0x000F1))
141 .Case("otilde", helperResolveHTMLHexCharacterReference(0x000F5))
142 .Case("szlig", helperResolveHTMLHexCharacterReference(0x000DF))
143 .Case("ccedil", helperResolveHTMLHexCharacterReference(0x000E7))
144 .Case("Ccedil", helperResolveHTMLHexCharacterReference(0x000C7))
145 .Case("aring", helperResolveHTMLHexCharacterReference(0x000E5))
146 .Case("Aring", helperResolveHTMLHexCharacterReference(0x000C5))
147 .Case("nbsp", helperResolveHTMLHexCharacterReference(0x000A0))
148 .Case("Gamma", helperResolveHTMLHexCharacterReference(0x00393))
149 .Case("Delta", helperResolveHTMLHexCharacterReference(0x00394))
150 .Case("Theta", helperResolveHTMLHexCharacterReference(0x00398))
151 .Case("Lambda", helperResolveHTMLHexCharacterReference(0x0039B))
152 .Case("Xi", helperResolveHTMLHexCharacterReference(0x0039E))
153 .Case("Pi", helperResolveHTMLHexCharacterReference(0x003A0))
154 .Case("Sigma", helperResolveHTMLHexCharacterReference(0x003A3))
155 .Case("Upsilon", helperResolveHTMLHexCharacterReference(0x003A5))
156 .Case("Phi", helperResolveHTMLHexCharacterReference(0x003A6))
157 .Case("Psi", helperResolveHTMLHexCharacterReference(0x003A8))
158 .Case("Omega", helperResolveHTMLHexCharacterReference(0x003A9))
159 .Case("alpha", helperResolveHTMLHexCharacterReference(0x003B1))
160 .Case("beta", helperResolveHTMLHexCharacterReference(0x003B2))
161 .Case("gamma", helperResolveHTMLHexCharacterReference(0x003B3))
162 .Case("delta", helperResolveHTMLHexCharacterReference(0x003B4))
163 .Case("epsilon", helperResolveHTMLHexCharacterReference(0x003B5))
164 .Case("zeta", helperResolveHTMLHexCharacterReference(0x003B6))
165 .Case("eta", helperResolveHTMLHexCharacterReference(0x003B7))
166 .Case("theta", helperResolveHTMLHexCharacterReference(0x003B8))
167 .Case("iota", helperResolveHTMLHexCharacterReference(0x003B9))
168 .Case("kappa", helperResolveHTMLHexCharacterReference(0x003BA))
169 .Case("lambda", helperResolveHTMLHexCharacterReference(0x003BB))
170 .Case("mu", helperResolveHTMLHexCharacterReference(0x003BC))
171 .Case("nu", helperResolveHTMLHexCharacterReference(0x003BD))
172 .Case("xi", helperResolveHTMLHexCharacterReference(0x003BE))
173 .Case("pi", helperResolveHTMLHexCharacterReference(0x003C0))
174 .Case("rho", helperResolveHTMLHexCharacterReference(0x003C1))
175 .Case("sigma", helperResolveHTMLHexCharacterReference(0x003C3))
176 .Case("tau", helperResolveHTMLHexCharacterReference(0x003C4))
177 .Case("upsilon", helperResolveHTMLHexCharacterReference(0x003C5))
178 .Case("phi", helperResolveHTMLHexCharacterReference(0x003C6))
179 .Case("chi", helperResolveHTMLHexCharacterReference(0x003C7))
180 .Case("psi", helperResolveHTMLHexCharacterReference(0x003C8))
181 .Case("omega", helperResolveHTMLHexCharacterReference(0x003C9))
182 .Case("sigmaf", helperResolveHTMLHexCharacterReference(0x003C2))
183 .Case("sect", helperResolveHTMLHexCharacterReference(0x000A7))
184 .Case("deg", helperResolveHTMLHexCharacterReference(0x000B0))
185 .Case("prime", helperResolveHTMLHexCharacterReference(0x02032))
186 .Case("Prime", helperResolveHTMLHexCharacterReference(0x02033))
187 .Case("infin", helperResolveHTMLHexCharacterReference(0x0221E))
188 .Case("empty", helperResolveHTMLHexCharacterReference(0x02205))
189 .Case("plusmn", helperResolveHTMLHexCharacterReference(0x000B1))
190 .Case("times", helperResolveHTMLHexCharacterReference(0x000D7))
191 .Case("minus", helperResolveHTMLHexCharacterReference(0x02212))
192 .Case("sdot", helperResolveHTMLHexCharacterReference(0x022C5))
193 .Case("part", helperResolveHTMLHexCharacterReference(0x02202))
194 .Case("nabla", helperResolveHTMLHexCharacterReference(0x02207))
195 .Case("radic", helperResolveHTMLHexCharacterReference(0x0221A))
196 .Case("perp", helperResolveHTMLHexCharacterReference(0x022A5))
197 .Case("sum", helperResolveHTMLHexCharacterReference(0x02211))
198 .Case("int", helperResolveHTMLHexCharacterReference(0x0222B))
199 .Case("prod", helperResolveHTMLHexCharacterReference(0x0220F))
200 .Case("sim", helperResolveHTMLHexCharacterReference(0x0223C))
201 .Case("asymp", helperResolveHTMLHexCharacterReference(0x02248))
202 .Case("ne", helperResolveHTMLHexCharacterReference(0x02260))
203 .Case("equiv", helperResolveHTMLHexCharacterReference(0x02261))
204 .Case("prop", helperResolveHTMLHexCharacterReference(0x0221D))
205 .Case("le", helperResolveHTMLHexCharacterReference(0x02264))
206 .Case("ge", helperResolveHTMLHexCharacterReference(0x02265))
207 .Case("larr", helperResolveHTMLHexCharacterReference(0x02190))
208 .Case("rarr", helperResolveHTMLHexCharacterReference(0x02192))
209 .Case("isin", helperResolveHTMLHexCharacterReference(0x02208))
210 .Case("notin", helperResolveHTMLHexCharacterReference(0x02209))
211 .Case("lceil", helperResolveHTMLHexCharacterReference(0x02308))
212 .Case("rceil", helperResolveHTMLHexCharacterReference(0x02309))
213 .Case("lfloor", helperResolveHTMLHexCharacterReference(0x0230A))
214 .Case("rfloor", helperResolveHTMLHexCharacterReference(0x0230B))
215 .Default("");
216}
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000217
218StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
219 unsigned CodePoint = 0;
220 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
221 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
222 CodePoint *= 10;
223 CodePoint += Name[i] - '0';
224 }
225
226 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
227 char *ResolvedPtr = Resolved;
228 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
229 return StringRef(Resolved, ResolvedPtr - Resolved);
230 else
231 return StringRef();
232}
233
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000234void Lexer::skipLineStartingDecorations() {
235 // This function should be called only for C comments
236 assert(CommentState == LCS_InsideCComment);
237
238 if (BufferPtr == CommentEnd)
239 return;
240
241 switch (*BufferPtr) {
242 case ' ':
243 case '\t':
244 case '\f':
245 case '\v': {
246 const char *NewBufferPtr = BufferPtr;
247 NewBufferPtr++;
248 if (NewBufferPtr == CommentEnd)
249 return;
250
251 char C = *NewBufferPtr;
252 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
253 NewBufferPtr++;
254 if (NewBufferPtr == CommentEnd)
255 return;
256 C = *NewBufferPtr;
257 }
258 if (C == '*')
259 BufferPtr = NewBufferPtr + 1;
260 break;
261 }
262 case '*':
263 BufferPtr++;
264 break;
265 }
266}
267
268namespace {
/// Returns a pointer to the first '\n' or '\r' in [BufferPtr, BufferEnd), or
/// \p BufferEnd if the range contains neither.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd && *Cur != '\n' && *Cur != '\r')
    ++Cur;
  return Cur;
}
278
/// Skips one line terminator ("\n", "\r" or "\r\n") at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd, in which case it is returned
/// unchanged, or point at '\n' or '\r' (asserted).
///
/// \returns a pointer just past the terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single terminator.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
293
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000294const char *skipNamedCharacterReference(const char *BufferPtr,
295 const char *BufferEnd) {
296 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
297 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
298 return BufferPtr;
299 }
300 return BufferEnd;
301}
302
303const char *skipDecimalCharacterReference(const char *BufferPtr,
304 const char *BufferEnd) {
305 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
306 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
307 return BufferPtr;
308 }
309 return BufferEnd;
310}
311
312const char *skipHexCharacterReference(const char *BufferPtr,
313 const char *BufferEnd) {
314 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
315 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
316 return BufferPtr;
317 }
318 return BufferEnd;
319}
320
/// Returns true if \p C may start an HTML identifier, i.e. if it is an ASCII
/// letter.
bool isHTMLIdentifierStartingCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  return 'A' <= C && C <= 'Z';
}
325
/// Returns true if \p C may appear inside an HTML identifier, i.e. if it is
/// an ASCII letter or decimal digit.
bool isHTMLIdentifierCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  if ('A' <= C && C <= 'Z')
    return true;
  return '0' <= C && C <= '9';
}
331
332const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
333 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
334 if (!isHTMLIdentifierCharacter(*BufferPtr))
335 return BufferPtr;
336 }
337 return BufferEnd;
338}
339
/// Skips an HTML string quoted in single or double quotes.
///
/// \p BufferPtr must point at the opening quote (asserted).  A quote
/// character that is immediately preceded by a backslash does not end the
/// string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    if (*Cur == Quote && Cur[-1] != '\\')
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
357
/// Returns true if \p C is a space, horizontal tab, form feed or vertical
/// tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
361
/// Returns true if \p C is a space, newline, carriage return, horizontal
/// tab, form feed or vertical tab.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
366
367const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
368 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
369 if (!isWhitespace(*BufferPtr))
370 return BufferPtr;
371 }
372 return BufferEnd;
373}
374
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000375bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
376 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
377}
378
/// Returns true if \p C may start a command name, i.e. if it is an ASCII
/// letter.
bool isCommandNameStartCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  return 'A' <= C && C <= 'Z';
}
383
/// Returns true if \p C may appear inside a command name, i.e. if it is an
/// ASCII letter or decimal digit.
bool isCommandNameCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  if ('A' <= C && C <= 'Z')
    return true;
  return '0' <= C && C <= '9';
}
389
390const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
391 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
392 if (!isCommandNameCharacter(*BufferPtr))
393 return BufferPtr;
394 }
395 return BufferEnd;
396}
397
398/// Return the one past end pointer for BCPL comments.
399/// Handles newlines escaped with backslash or trigraph for backslahs.
400const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
401 const char *CurPtr = BufferPtr;
402 while (CurPtr != BufferEnd) {
403 char C = *CurPtr;
404 while (C != '\n' && C != '\r') {
405 CurPtr++;
406 if (CurPtr == BufferEnd)
407 return BufferEnd;
408 C = *CurPtr;
409 }
410 // We found a newline, check if it is escaped.
411 const char *EscapePtr = CurPtr - 1;
412 while(isHorizontalWhitespace(*EscapePtr))
413 EscapePtr--;
414
415 if (*EscapePtr == '\\' ||
416 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
417 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
418 // We found an escaped newline.
419 CurPtr = skipNewline(CurPtr, BufferEnd);
420 } else
421 return CurPtr; // Not an escaped newline.
422 }
423 return BufferEnd;
424}
425
426/// Return the one past end pointer for C comments.
427/// Very dumb, does not handle escaped newlines or trigraphs.
428const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
429 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
430 if (*BufferPtr == '*') {
431 assert(BufferPtr + 1 != BufferEnd);
432 if (*(BufferPtr + 1) == '/')
433 return BufferPtr;
434 }
435 }
436 llvm_unreachable("buffer end hit before '*/' was seen");
437}
438} // unnamed namespace
439
440void Lexer::lexCommentText(Token &T) {
441 assert(CommentState == LCS_InsideBCPLComment ||
442 CommentState == LCS_InsideCComment);
443
444 switch (State) {
445 case LS_Normal:
446 break;
447 case LS_VerbatimBlockFirstLine:
448 lexVerbatimBlockFirstLine(T);
449 return;
450 case LS_VerbatimBlockBody:
451 lexVerbatimBlockBody(T);
452 return;
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000453 case LS_VerbatimLineText:
454 lexVerbatimLineText(T);
455 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000456 case LS_HTMLStartTag:
457 lexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000458 return;
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000459 case LS_HTMLEndTag:
460 lexHTMLEndTag(T);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000461 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000462 }
463
464 assert(State == LS_Normal);
465
466 const char *TokenPtr = BufferPtr;
467 assert(TokenPtr < CommentEnd);
468 while (TokenPtr != CommentEnd) {
469 switch(*TokenPtr) {
470 case '\\':
471 case '@': {
472 TokenPtr++;
473 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000474 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000475 return;
476 }
477 char C = *TokenPtr;
478 switch (C) {
479 default:
480 break;
481
482 case '\\': case '@': case '&': case '$':
483 case '#': case '<': case '>': case '%':
484 case '\"': case '.': case ':':
485 // This is one of \\ \@ \& \$ etc escape sequences.
486 TokenPtr++;
487 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
488 // This is the \:: escape sequence.
489 TokenPtr++;
490 }
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000491 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000492 formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000493 T.setText(UnescapedText);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000494 return;
495 }
496
497 // Don't make zero-length commands.
Dmitri Gribenko8c05da32012-09-14 16:35:35 +0000498 if (!isCommandNameStartCharacter(*TokenPtr)) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000499 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000500 return;
501 }
502
503 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
504 unsigned Length = TokenPtr - (BufferPtr + 1);
505
506 // Hardcoded support for lexing LaTeX formula commands
507 // \f$ \f[ \f] \f{ \f} as a single command.
508 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
509 C = *TokenPtr;
510 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
511 TokenPtr++;
512 Length++;
513 }
514 }
515
516 const StringRef CommandName(BufferPtr + 1, Length);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000517
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000518 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
519 if (!Info) {
520 formTokenWithChars(T, TokenPtr, tok::unknown_command);
521 T.setUnknownCommandName(CommandName);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000522 return;
523 }
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000524 if (Info->IsVerbatimBlockCommand) {
525 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
526 return;
527 }
528 if (Info->IsVerbatimLineCommand) {
529 setupAndLexVerbatimLine(T, TokenPtr, Info);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000530 return;
531 }
532 formTokenWithChars(T, TokenPtr, tok::command);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000533 T.setCommandID(Info->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000534 return;
535 }
536
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000537 case '&':
538 lexHTMLCharacterReference(T);
539 return;
540
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000541 case '<': {
542 TokenPtr++;
543 if (TokenPtr == CommentEnd) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000544 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000545 return;
546 }
547 const char C = *TokenPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000548 if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000549 setupAndLexHTMLStartTag(T);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000550 else if (C == '/')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000551 setupAndLexHTMLEndTag(T);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000552 else
553 formTextToken(T, TokenPtr);
554
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000555 return;
556 }
557
558 case '\n':
559 case '\r':
560 TokenPtr = skipNewline(TokenPtr, CommentEnd);
561 formTokenWithChars(T, TokenPtr, tok::newline);
562
563 if (CommentState == LCS_InsideCComment)
564 skipLineStartingDecorations();
565 return;
566
567 default: {
Dmitri Gribenkoaa7dbaf2012-12-30 19:45:46 +0000568 size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
569 find_first_of("\n\r\\@&<");
570 if (End != StringRef::npos)
571 TokenPtr += End;
572 else
573 TokenPtr = CommentEnd;
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000574 formTextToken(T, TokenPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000575 return;
576 }
577 }
578 }
579}
580
581void Lexer::setupAndLexVerbatimBlock(Token &T,
582 const char *TextBegin,
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000583 char Marker, const CommandInfo *Info) {
584 assert(Info->IsVerbatimBlockCommand);
585
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000586 VerbatimBlockEndCommandName.clear();
587 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000588 VerbatimBlockEndCommandName.append(Info->EndCommandName);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000589
590 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000591 T.setVerbatimBlockID(Info->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000592
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000593 // If there is a newline following the verbatim opening command, skip the
594 // newline so that we don't create an tok::verbatim_block_line with empty
595 // text content.
596 if (BufferPtr != CommentEnd) {
597 const char C = *BufferPtr;
598 if (C == '\n' || C == '\r') {
599 BufferPtr = skipNewline(BufferPtr, CommentEnd);
600 State = LS_VerbatimBlockBody;
601 return;
602 }
603 }
604
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000605 State = LS_VerbatimBlockFirstLine;
606}
607
608void Lexer::lexVerbatimBlockFirstLine(Token &T) {
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000609again:
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000610 assert(BufferPtr < CommentEnd);
611
612 // FIXME: It would be better to scan the text once, finding either the block
613 // end command or newline.
614 //
615 // Extract current line.
616 const char *Newline = findNewline(BufferPtr, CommentEnd);
617 StringRef Line(BufferPtr, Newline - BufferPtr);
618
619 // Look for end command in current line.
620 size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000621 const char *TextEnd;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000622 const char *NextLine;
623 if (Pos == StringRef::npos) {
624 // Current line is completely verbatim.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000625 TextEnd = Newline;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000626 NextLine = skipNewline(Newline, CommentEnd);
627 } else if (Pos == 0) {
628 // Current line contains just an end command.
629 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000630 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000631 formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000632 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000633 State = LS_Normal;
634 return;
635 } else {
636 // There is some text, followed by end command. Extract text first.
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000637 TextEnd = BufferPtr + Pos;
638 NextLine = TextEnd;
Dmitri Gribenko64da4e52012-07-18 23:01:58 +0000639 // If there is only whitespace before end command, skip whitespace.
640 if (isWhitespace(BufferPtr, TextEnd)) {
641 BufferPtr = TextEnd;
642 goto again;
643 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000644 }
645
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000646 StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000647 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000648 T.setVerbatimBlockText(Text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000649
650 State = LS_VerbatimBlockBody;
651}
652
653void Lexer::lexVerbatimBlockBody(Token &T) {
654 assert(State == LS_VerbatimBlockBody);
655
656 if (CommentState == LCS_InsideCComment)
657 skipLineStartingDecorations();
658
659 lexVerbatimBlockFirstLine(T);
660}
661
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000662void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
663 const CommandInfo *Info) {
664 assert(Info->IsVerbatimLineCommand);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000665 formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
Dmitri Gribenkoe4330a32012-09-10 20:32:42 +0000666 T.setVerbatimLineID(Info->getID());
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000667
668 State = LS_VerbatimLineText;
669}
670
671void Lexer::lexVerbatimLineText(Token &T) {
672 assert(State == LS_VerbatimLineText);
673
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000674 // Extract current line.
675 const char *Newline = findNewline(BufferPtr, CommentEnd);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000676 const StringRef Text(BufferPtr, Newline - BufferPtr);
677 formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000678 T.setVerbatimLineText(Text);
Dmitri Gribenko962668d2012-06-27 16:53:58 +0000679
680 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000681}
682
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000683void Lexer::lexHTMLCharacterReference(Token &T) {
684 const char *TokenPtr = BufferPtr;
685 assert(*TokenPtr == '&');
686 TokenPtr++;
687 if (TokenPtr == CommentEnd) {
688 formTextToken(T, TokenPtr);
689 return;
690 }
691 const char *NamePtr;
692 bool isNamed = false;
693 bool isDecimal = false;
694 char C = *TokenPtr;
695 if (isHTMLNamedCharacterReferenceCharacter(C)) {
696 NamePtr = TokenPtr;
697 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
698 isNamed = true;
699 } else if (C == '#') {
700 TokenPtr++;
701 if (TokenPtr == CommentEnd) {
702 formTextToken(T, TokenPtr);
703 return;
704 }
705 C = *TokenPtr;
706 if (isHTMLDecimalCharacterReferenceCharacter(C)) {
707 NamePtr = TokenPtr;
708 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
709 isDecimal = true;
710 } else if (C == 'x' || C == 'X') {
711 TokenPtr++;
712 NamePtr = TokenPtr;
713 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
714 } else {
715 formTextToken(T, TokenPtr);
716 return;
717 }
718 } else {
719 formTextToken(T, TokenPtr);
720 return;
721 }
722 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
723 *TokenPtr != ';') {
724 formTextToken(T, TokenPtr);
725 return;
726 }
727 StringRef Name(NamePtr, TokenPtr - NamePtr);
728 TokenPtr++; // Skip semicolon.
729 StringRef Resolved;
Fariborz Jahanian658a1152013-01-29 23:42:26 +0000730 if (isNamed) {
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000731 Resolved = resolveHTMLNamedCharacterReference(Name);
Fariborz Jahanian658a1152013-01-29 23:42:26 +0000732 if (Resolved.empty()) {
733 Resolved = HTMLDoxygenCharacterReference(Name);
734 if (!Resolved.empty()) {
735 formTokenWithChars(T, TokenPtr, tok::text);
736 T.setText(Resolved);
737 return;
738 }
739 }
740 }
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000741 else if (isDecimal)
742 Resolved = resolveHTMLDecimalCharacterReference(Name);
743 else
744 Resolved = resolveHTMLHexCharacterReference(Name);
745
746 if (Resolved.empty()) {
747 formTextToken(T, TokenPtr);
748 return;
749 }
750 formTokenWithChars(T, TokenPtr, tok::text);
751 T.setText(Resolved);
752 return;
753}
754
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000755void Lexer::setupAndLexHTMLStartTag(Token &T) {
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000756 assert(BufferPtr[0] == '<' &&
757 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000758 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000759 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000760 if (!isHTMLTagName(Name)) {
761 formTextToken(T, TagNameEnd);
762 return;
763 }
764
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000765 formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
766 T.setHTMLTagStartName(Name);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000767
768 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
769
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000770 const char C = *BufferPtr;
771 if (BufferPtr != CommentEnd &&
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000772 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000773 State = LS_HTMLStartTag;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000774}
775
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000776void Lexer::lexHTMLStartTag(Token &T) {
777 assert(State == LS_HTMLStartTag);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000778
779 const char *TokenPtr = BufferPtr;
780 char C = *TokenPtr;
781 if (isHTMLIdentifierCharacter(C)) {
782 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000783 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000784 formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenkof5e0aea2012-06-27 16:30:35 +0000785 T.setHTMLIdent(Ident);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000786 } else {
787 switch (C) {
788 case '=':
789 TokenPtr++;
790 formTokenWithChars(T, TokenPtr, tok::html_equals);
791 break;
792 case '\"':
793 case '\'': {
794 const char *OpenQuote = TokenPtr;
795 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
796 const char *ClosingQuote = TokenPtr;
797 if (TokenPtr != CommentEnd) // Skip closing quote.
798 TokenPtr++;
799 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
800 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
801 ClosingQuote - (OpenQuote + 1)));
802 break;
803 }
804 case '>':
805 TokenPtr++;
806 formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000807 State = LS_Normal;
808 return;
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000809 case '/':
810 TokenPtr++;
811 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
812 TokenPtr++;
813 formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
Dmitri Gribenko477a9f52012-07-27 20:37:06 +0000814 } else
815 formTextToken(T, TokenPtr);
816
Dmitri Gribenkoa5ef44f2012-07-11 21:38:39 +0000817 State = LS_Normal;
818 return;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000819 }
820 }
821
822 // Now look ahead and return to normal state if we don't see any HTML tokens
823 // ahead.
824 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
825 if (BufferPtr == CommentEnd) {
826 State = LS_Normal;
827 return;
828 }
829
830 C = *BufferPtr;
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000831 if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000832 C != '=' && C != '\"' && C != '\'' && C != '>') {
833 State = LS_Normal;
834 return;
835 }
836}
837
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000838void Lexer::setupAndLexHTMLEndTag(Token &T) {
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000839 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
840
841 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
842 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000843 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
844 if (!isHTMLTagName(Name)) {
845 formTextToken(T, TagNameEnd);
846 return;
847 }
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000848
849 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000850
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000851 formTokenWithChars(T, End, tok::html_end_tag);
Dmitri Gribenko834a5bd2012-08-22 22:56:08 +0000852 T.setHTMLTagEndName(Name);
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000853
854 if (BufferPtr != CommentEnd && *BufferPtr == '>')
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000855 State = LS_HTMLEndTag;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000856}
857
Dmitri Gribenko3f38bf22012-07-13 00:44:24 +0000858void Lexer::lexHTMLEndTag(Token &T) {
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000859 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
860
861 formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
862 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000863}
864
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000865Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
Dmitri Gribenkoaf503a62012-08-31 10:35:30 +0000866 SourceLocation FileLoc,
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000867 const char *BufferStart, const char *BufferEnd):
Dmitri Gribenkoaa580812012-08-09 00:03:17 +0000868 Allocator(Allocator), Traits(Traits),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000869 BufferStart(BufferStart), BufferEnd(BufferEnd),
Dmitri Gribenkoaf503a62012-08-31 10:35:30 +0000870 FileLoc(FileLoc), BufferPtr(BufferStart),
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000871 CommentState(LCS_BeforeComment), State(LS_Normal) {
872}
873
874void Lexer::lex(Token &T) {
875again:
876 switch (CommentState) {
877 case LCS_BeforeComment:
878 if (BufferPtr == BufferEnd) {
879 formTokenWithChars(T, BufferPtr, tok::eof);
880 return;
881 }
882
883 assert(*BufferPtr == '/');
884 BufferPtr++; // Skip first slash.
885 switch(*BufferPtr) {
886 case '/': { // BCPL comment.
887 BufferPtr++; // Skip second slash.
888
889 if (BufferPtr != BufferEnd) {
890 // Skip Doxygen magic marker, if it is present.
891 // It might be missing because of a typo //< or /*<, or because we
892 // merged this non-Doxygen comment into a bunch of Doxygen comments
893 // around it: /** ... */ /* ... */ /** ... */
894 const char C = *BufferPtr;
895 if (C == '/' || C == '!')
896 BufferPtr++;
897 }
898
899 // Skip less-than symbol that marks trailing comments.
900 // Skip it even if the comment is not a Doxygen one, because //< and /*<
901 // are frequent typos.
902 if (BufferPtr != BufferEnd && *BufferPtr == '<')
903 BufferPtr++;
904
905 CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko8d3ba232012-07-06 00:28:32 +0000906 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
907 State = LS_Normal;
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000908 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
909 goto again;
910 }
911 case '*': { // C comment.
912 BufferPtr++; // Skip star.
913
914 // Skip Doxygen magic marker.
915 const char C = *BufferPtr;
916 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
917 BufferPtr++;
918
919 // Skip less-than symbol that marks trailing comments.
920 if (BufferPtr != BufferEnd && *BufferPtr == '<')
921 BufferPtr++;
922
923 CommentState = LCS_InsideCComment;
924 State = LS_Normal;
925 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
926 goto again;
927 }
928 default:
929 llvm_unreachable("second character of comment should be '/' or '*'");
930 }
931
932 case LCS_BetweenComments: {
933 // Consecutive comments are extracted only if there is only whitespace
934 // between them. So we can search for the start of the next comment.
935 const char *EndWhitespace = BufferPtr;
936 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
937 EndWhitespace++;
938
939 // Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenkoa99ec102012-07-09 21:32:40 +0000940 // between them -- guaranteed by comment extraction) into a newline. We
941 // have two newlines between C comments in total (first one was synthesized
942 // after a comment).
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000943 formTokenWithChars(T, EndWhitespace, tok::newline);
944
945 CommentState = LCS_BeforeComment;
946 break;
947 }
948
949 case LCS_InsideBCPLComment:
950 case LCS_InsideCComment:
951 if (BufferPtr != CommentEnd) {
952 lexCommentText(T);
953 break;
954 } else {
955 // Skip C comment closing sequence.
956 if (CommentState == LCS_InsideCComment) {
957 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
958 BufferPtr += 2;
959 assert(BufferPtr <= BufferEnd);
960
961 // Synthenize newline just after the C comment, regardless if there is
962 // actually a newline.
963 formTokenWithChars(T, BufferPtr, tok::newline);
964
965 CommentState = LCS_BetweenComments;
966 break;
967 } else {
968 // Don't synthesized a newline after BCPL comment.
969 CommentState = LCS_BetweenComments;
970 goto again;
971 }
972 }
973 }
974}
975
976StringRef Lexer::getSpelling(const Token &Tok,
977 const SourceManager &SourceMgr,
978 bool *Invalid) const {
979 SourceLocation Loc = Tok.getLocation();
980 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
981
982 bool InvalidTemp = false;
983 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
984 if (InvalidTemp) {
985 *Invalid = true;
986 return StringRef();
987 }
988
989 const char *Begin = File.data() + LocInfo.second;
990 return StringRef(Begin, Tok.getLength());
991}
992
Dmitri Gribenko2d44d772012-06-26 20:39:18 +0000993} // end namespace comments
994} // end namespace clang
995