Blame - clang/lib/AST/CommentLexer.cpp - toolchain/llvm-project

blob: 57bfef08df6ea679ab39f2c9015d977212afd505 [file] [log] [blame]

Eugene Zelenko	0a4f3f4	2016-02-10 19:11:58 +0000	[diff] [blame]	1	//===--- CommentLexer.cpp -------------------------------------------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	10	#include "clang/AST/CommentLexer.h"
Dmitri Gribenko	ca7f80a	2012-08-09 00:03:17 +0000	[diff] [blame]	11	#include "clang/AST/CommentCommandTraits.h"
Fariborz Jahanian	6738e43	2013-05-04 00:47:28 +0000	[diff] [blame]	12	#include "clang/AST/CommentDiagnostic.h"
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	13	#include "clang/Basic/CharInfo.h"
Dmitri Gribenko	b2e5482	2013-01-19 22:06:05 +0000	[diff] [blame]	14	#include "llvm/ADT/StringExtras.h"
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	15	#include "llvm/ADT/StringSwitch.h"
Dmitri Gribenko	9feeef4	2013-01-30 12:06:08 +0000	[diff] [blame]	16	#include "llvm/Support/ConvertUTF.h"
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	17	#include "llvm/Support/ErrorHandling.h"
				18
				19	namespace clang {
				20	namespace comments {
				21
				22	void Token::dump(const Lexer &L, const SourceManager &SM) const {
				23	llvm::errs() << "comments::Token Kind=" << Kind << " ";
				24	Loc.dump(SM);
				25	llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
				26	}
				27
Dmitri Gribenko	74f4d02	2013-02-10 11:54:22 +0000	[diff] [blame]	28	static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	29	return isLetter(C);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	30	}
				31
Dmitri Gribenko	74f4d02	2013-02-10 11:54:22 +0000	[diff] [blame]	32	static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	33	return isDigit(C);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	34	}
				35
Dmitri Gribenko	74f4d02	2013-02-10 11:54:22 +0000	[diff] [blame]	36	static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	37	return isHexDigit(C);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	38	}
Dmitri Gribenko	107618a	2012-08-22 22:56:08 +0000	[diff] [blame]	39
Dmitri Gribenko	74f4d02	2013-02-10 11:54:22 +0000	[diff] [blame]	40	static inline StringRef convertCodePointToUTF8(
				41	llvm::BumpPtrAllocator &Allocator,
				42	unsigned CodePoint) {
Fariborz Jahanian	7b3ae19	2013-01-29 23:42:26 +0000	[diff] [blame]	43	char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
				44	char *ResolvedPtr = Resolved;
Dmitri Gribenko	9feeef4	2013-01-30 12:06:08 +0000	[diff] [blame]	45	if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
Fariborz Jahanian	7b3ae19	2013-01-29 23:42:26 +0000	[diff] [blame]	46	return StringRef(Resolved, ResolvedPtr - Resolved);
				47	else
				48	return StringRef();
				49	}
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	50
Dmitri Gribenko	74f4d02	2013-02-10 11:54:22 +0000	[diff] [blame]	51	namespace {
				52
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	53	#include "clang/AST/CommentHTMLTags.inc"
				54	#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
				55
Eugene Zelenko	0a4f3f4	2016-02-10 19:11:58 +0000	[diff] [blame]	56	} // end anonymous namespace
Fariborz Jahanian	7b3ae19	2013-01-29 23:42:26 +0000	[diff] [blame]	57
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	58	StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	59	// Fast path, first check a few most widely used named character references.
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	60	return llvm::StringSwitch<StringRef>(Name)
				61	.Case("amp", "&")
				62	.Case("lt", "<")
				63	.Case("gt", ">")
				64	.Case("quot", "\"")
				65	.Case("apos", "\'")
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	66	// Slow path.
				67	.Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
Fariborz Jahanian	7b3ae19	2013-01-29 23:42:26 +0000	[diff] [blame]	68	}
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	69
				70	StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
				71	unsigned CodePoint = 0;
				72	for (unsigned i = 0, e = Name.size(); i != e; ++i) {
				73	assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
				74	CodePoint *= 10;
				75	CodePoint += Name[i] - '0';
				76	}
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	77	return convertCodePointToUTF8(Allocator, CodePoint);
				78	}
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	79
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	80	StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
				81	unsigned CodePoint = 0;
				82	for (unsigned i = 0, e = Name.size(); i != e; ++i) {
				83	CodePoint *= 16;
				84	const char C = Name[i];
				85	assert(isHTMLHexCharacterReferenceCharacter(C));
				86	CodePoint += llvm::hexDigitValue(C);
				87	}
				88	return convertCodePointToUTF8(Allocator, CodePoint);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	89	}
				90
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	91	void Lexer::skipLineStartingDecorations() {
				92	// This function should be called only for C comments
				93	assert(CommentState == LCS_InsideCComment);
				94
				95	if (BufferPtr == CommentEnd)
				96	return;
				97
				98	switch (*BufferPtr) {
				99	case ' ':
				100	case '\t':
				101	case '\f':
				102	case '\v': {
				103	const char *NewBufferPtr = BufferPtr;
				104	NewBufferPtr++;
				105	if (NewBufferPtr == CommentEnd)
				106	return;
				107
				108	char C = *NewBufferPtr;
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	109	while (isHorizontalWhitespace(C)) {
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	110	NewBufferPtr++;
				111	if (NewBufferPtr == CommentEnd)
				112	return;
				113	C = *NewBufferPtr;
				114	}
				115	if (C == '*')
				116	BufferPtr = NewBufferPtr + 1;
				117	break;
				118	}
				119	case '*':
				120	BufferPtr++;
				121	break;
				122	}
				123	}
				124
				125	namespace {
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	126	/// Returns pointer to the first newline character in the string.
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	127	const char findNewline(const char BufferPtr, const char *BufferEnd) {
				128	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	129	if (isVerticalWhitespace(*BufferPtr))
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	130	return BufferPtr;
				131	}
				132	return BufferEnd;
				133	}
				134
				135	const char skipNewline(const char BufferPtr, const char *BufferEnd) {
				136	if (BufferPtr == BufferEnd)
				137	return BufferPtr;
				138
				139	if (*BufferPtr == '\n')
				140	BufferPtr++;
				141	else {
				142	assert(*BufferPtr == '\r');
				143	BufferPtr++;
				144	if (BufferPtr != BufferEnd && *BufferPtr == '\n')
				145	BufferPtr++;
				146	}
				147	return BufferPtr;
				148	}
				149
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	150	const char skipNamedCharacterReference(const char BufferPtr,
				151	const char *BufferEnd) {
				152	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				153	if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
				154	return BufferPtr;
				155	}
				156	return BufferEnd;
				157	}
				158
				159	const char skipDecimalCharacterReference(const char BufferPtr,
				160	const char *BufferEnd) {
				161	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				162	if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
				163	return BufferPtr;
				164	}
				165	return BufferEnd;
				166	}
				167
				168	const char skipHexCharacterReference(const char BufferPtr,
Dmitri Gribenko	11f54e8	2013-08-23 17:48:41 +0000	[diff] [blame]	169	const char *BufferEnd) {
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	170	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				171	if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
				172	return BufferPtr;
				173	}
				174	return BufferEnd;
				175	}
				176
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	177	bool isHTMLIdentifierStartingCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	178	return isLetter(C);
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	179	}
				180
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	181	bool isHTMLIdentifierCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	182	return isAlphanumeric(C);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	183	}
				184
				185	const char skipHTMLIdentifier(const char BufferPtr, const char *BufferEnd) {
				186	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				187	if (!isHTMLIdentifierCharacter(*BufferPtr))
				188	return BufferPtr;
				189	}
				190	return BufferEnd;
				191	}
				192
				193	/// Skip HTML string quoted in single or double quotes. Escaping quotes inside
				194	/// string allowed.
				195	///
				196	/// Returns pointer to closing quote.
				197	const char skipHTMLQuotedString(const char BufferPtr, const char *BufferEnd)
				198	{
				199	const char Quote = *BufferPtr;
				200	assert(Quote == '\"' \|\| Quote == '\'');
				201
				202	BufferPtr++;
				203	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				204	const char C = *BufferPtr;
				205	if (C == Quote && BufferPtr[-1] != '\\')
				206	return BufferPtr;
				207	}
				208	return BufferEnd;
				209	}
				210
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	211	const char skipWhitespace(const char BufferPtr, const char *BufferEnd) {
				212	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				213	if (!isWhitespace(*BufferPtr))
				214	return BufferPtr;
				215	}
				216	return BufferEnd;
				217	}
				218
Dmitri Gribenko	e4a3997	2012-07-18 23:01:58 +0000	[diff] [blame]	219	bool isWhitespace(const char BufferPtr, const char BufferEnd) {
				220	return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
				221	}
				222
Dmitri Gribenko	ad45ad6	2012-09-14 16:35:35 +0000	[diff] [blame]	223	bool isCommandNameStartCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	224	return isLetter(C);
Dmitri Gribenko	ad45ad6	2012-09-14 16:35:35 +0000	[diff] [blame]	225	}
				226
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	227	bool isCommandNameCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	228	return isAlphanumeric(C);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	229	}
				230
				231	const char skipCommandName(const char BufferPtr, const char *BufferEnd) {
				232	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				233	if (!isCommandNameCharacter(*BufferPtr))
				234	return BufferPtr;
				235	}
				236	return BufferEnd;
				237	}
				238
				239	/// Return the one past end pointer for BCPL comments.
				240	/// Handles newlines escaped with backslash or trigraph for backslahs.
				241	const char findBCPLCommentEnd(const char BufferPtr, const char *BufferEnd) {
				242	const char *CurPtr = BufferPtr;
				243	while (CurPtr != BufferEnd) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	244	while (!isVerticalWhitespace(*CurPtr)) {
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	245	CurPtr++;
				246	if (CurPtr == BufferEnd)
				247	return BufferEnd;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	248	}
				249	// We found a newline, check if it is escaped.
				250	const char *EscapePtr = CurPtr - 1;
				251	while(isHorizontalWhitespace(*EscapePtr))
				252	EscapePtr--;
				253
				254	if (*EscapePtr == '\\' \|\|
				255	(EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
				256	EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
				257	// We found an escaped newline.
				258	CurPtr = skipNewline(CurPtr, BufferEnd);
				259	} else
				260	return CurPtr; // Not an escaped newline.
				261	}
				262	return BufferEnd;
				263	}
				264
				265	/// Return the one past end pointer for C comments.
				266	/// Very dumb, does not handle escaped newlines or trigraphs.
				267	const char findCCommentEnd(const char BufferPtr, const char *BufferEnd) {
				268	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				269	if (BufferPtr == '') {
				270	assert(BufferPtr + 1 != BufferEnd);
				271	if (*(BufferPtr + 1) == '/')
				272	return BufferPtr;
				273	}
				274	}
				275	llvm_unreachable("buffer end hit before '*/' was seen");
				276	}
Fariborz Jahanian	6c7a166	2013-05-08 19:21:00 +0000	[diff] [blame]	277
Eugene Zelenko	0a4f3f4	2016-02-10 19:11:58 +0000	[diff] [blame]	278	} // end anonymous namespace
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	279
Alp Toker	3ffab05	2013-12-07 13:51:26 +0000	[diff] [blame]	280	void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
				281	tok::TokenKind Kind) {
				282	const unsigned TokLen = TokEnd - BufferPtr;
				283	Result.setLocation(getSourceLocation(BufferPtr));
				284	Result.setKind(Kind);
				285	Result.setLength(TokLen);
				286	#ifndef NDEBUG
				287	Result.TextPtr = "<UNSET>";
				288	Result.IntVal = 7;
				289	#endif
				290	BufferPtr = TokEnd;
				291	}
				292
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	293	void Lexer::lexCommentText(Token &T) {
				294	assert(CommentState == LCS_InsideBCPLComment \|\|
				295	CommentState == LCS_InsideCComment);
				296
				297	switch (State) {
				298	case LS_Normal:
				299	break;
				300	case LS_VerbatimBlockFirstLine:
				301	lexVerbatimBlockFirstLine(T);
				302	return;
				303	case LS_VerbatimBlockBody:
				304	lexVerbatimBlockBody(T);
				305	return;
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	306	case LS_VerbatimLineText:
				307	lexVerbatimLineText(T);
				308	return;
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	309	case LS_HTMLStartTag:
				310	lexHTMLStartTag(T);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	311	return;
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	312	case LS_HTMLEndTag:
				313	lexHTMLEndTag(T);
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	314	return;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	315	}
				316
				317	assert(State == LS_Normal);
				318
				319	const char *TokenPtr = BufferPtr;
				320	assert(TokenPtr < CommentEnd);
				321	while (TokenPtr != CommentEnd) {
				322	switch(*TokenPtr) {
				323	case '\\':
				324	case '@': {
Dmitri Gribenko	bcf7f4d	2013-03-04 23:06:15 +0000	[diff] [blame]	325	// Commands that start with a backslash and commands that start with
				326	// 'at' have equivalent semantics. But we keep information about the
				327	// exact syntax in AST for comments.
				328	tok::TokenKind CommandKind =
				329	(*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	330	TokenPtr++;
				331	if (TokenPtr == CommentEnd) {
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	332	formTextToken(T, TokenPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	333	return;
				334	}
				335	char C = *TokenPtr;
				336	switch (C) {
				337	default:
				338	break;
				339
				340	case '\\': case '@': case '&': case '$':
				341	case '#': case '<': case '>': case '%':
				342	case '\"': case '.': case ':':
				343	// This is one of \\ \@ \& \$ etc escape sequences.
				344	TokenPtr++;
				345	if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
				346	// This is the \:: escape sequence.
				347	TokenPtr++;
				348	}
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	349	StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	350	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	351	T.setText(UnescapedText);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	352	return;
				353	}
				354
				355	// Don't make zero-length commands.
Dmitri Gribenko	ad45ad6	2012-09-14 16:35:35 +0000	[diff] [blame]	356	if (!isCommandNameStartCharacter(*TokenPtr)) {
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	357	formTextToken(T, TokenPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	358	return;
				359	}
				360
				361	TokenPtr = skipCommandName(TokenPtr, CommentEnd);
				362	unsigned Length = TokenPtr - (BufferPtr + 1);
				363
				364	// Hardcoded support for lexing LaTeX formula commands
				365	// \f$ \f[ \f] \f{ \f} as a single command.
				366	if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
				367	C = *TokenPtr;
				368	if (C == '$' \|\| C == '[' \|\| C == ']' \|\| C == '{' \|\| C == '}') {
				369	TokenPtr++;
				370	Length++;
				371	}
				372	}
				373
Craig Topper	bf3e327	2014-08-30 16:55:52 +0000	[diff] [blame]	374	StringRef CommandName(BufferPtr + 1, Length);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	375
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	376	const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
				377	if (!Info) {
Fariborz Jahanian	ebb262f	2013-05-08 20:29:57 +0000	[diff] [blame]	378	if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
Fariborz Jahanian	6c7a166	2013-05-08 19:21:00 +0000	[diff] [blame]	379	StringRef CorrectedName = Info->Name;
Benjamin Kramer	2dece57	2013-12-01 15:09:32 +0000	[diff] [blame]	380	SourceLocation Loc = getSourceLocation(BufferPtr);
				381	SourceRange CommandRange(Loc.getLocWithOffset(1),
				382	getSourceLocation(TokenPtr));
				383	Diag(Loc, diag::warn_correct_comment_command_name)
Fariborz Jahanian	6c7a166	2013-05-08 19:21:00 +0000	[diff] [blame]	384	<< CommandName << CorrectedName
				385	<< FixItHint::CreateReplacement(CommandRange, CorrectedName);
				386	} else {
Benjamin Kramer	2dece57	2013-12-01 15:09:32 +0000	[diff] [blame]	387	formTokenWithChars(T, TokenPtr, tok::unknown_command);
				388	T.setUnknownCommandName(CommandName);
Fariborz Jahanian	6c7a166	2013-05-08 19:21:00 +0000	[diff] [blame]	389	Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
				390	return;
				391	}
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	392	}
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	393	if (Info->IsVerbatimBlockCommand) {
				394	setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
				395	return;
				396	}
				397	if (Info->IsVerbatimLineCommand) {
				398	setupAndLexVerbatimLine(T, TokenPtr, Info);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	399	return;
				400	}
Dmitri Gribenko	bcf7f4d	2013-03-04 23:06:15 +0000	[diff] [blame]	401	formTokenWithChars(T, TokenPtr, CommandKind);
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	402	T.setCommandID(Info->getID());
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	403	return;
				404	}
				405
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	406	case '&':
				407	lexHTMLCharacterReference(T);
				408	return;
				409
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	410	case '<': {
				411	TokenPtr++;
				412	if (TokenPtr == CommentEnd) {
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	413	formTextToken(T, TokenPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	414	return;
				415	}
				416	const char C = *TokenPtr;
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	417	if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	418	setupAndLexHTMLStartTag(T);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	419	else if (C == '/')
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	420	setupAndLexHTMLEndTag(T);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	421	else
				422	formTextToken(T, TokenPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	423	return;
				424	}
				425
				426	case '\n':
				427	case '\r':
				428	TokenPtr = skipNewline(TokenPtr, CommentEnd);
				429	formTokenWithChars(T, TokenPtr, tok::newline);
				430
				431	if (CommentState == LCS_InsideCComment)
				432	skipLineStartingDecorations();
				433	return;
				434
				435	default: {
Dmitri Gribenko	10af67a	2012-12-30 19:45:46 +0000	[diff] [blame]	436	size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
				437	find_first_of("\n\r\\@&<");
				438	if (End != StringRef::npos)
				439	TokenPtr += End;
				440	else
				441	TokenPtr = CommentEnd;
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	442	formTextToken(T, TokenPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	443	return;
				444	}
				445	}
				446	}
				447	}
				448
				449	void Lexer::setupAndLexVerbatimBlock(Token &T,
				450	const char *TextBegin,
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	451	char Marker, const CommandInfo *Info) {
				452	assert(Info->IsVerbatimBlockCommand);
				453
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	454	VerbatimBlockEndCommandName.clear();
				455	VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	456	VerbatimBlockEndCommandName.append(Info->EndCommandName);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	457
				458	formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	459	T.setVerbatimBlockID(Info->getID());
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	460
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	461	// If there is a newline following the verbatim opening command, skip the
				462	// newline so that we don't create an tok::verbatim_block_line with empty
				463	// text content.
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	464	if (BufferPtr != CommentEnd &&
				465	isVerticalWhitespace(*BufferPtr)) {
				466	BufferPtr = skipNewline(BufferPtr, CommentEnd);
				467	State = LS_VerbatimBlockBody;
				468	return;
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	469	}
				470
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	471	State = LS_VerbatimBlockFirstLine;
				472	}
				473
				474	void Lexer::lexVerbatimBlockFirstLine(Token &T) {
Dmitri Gribenko	e4a3997	2012-07-18 23:01:58 +0000	[diff] [blame]	475	again:
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	476	assert(BufferPtr < CommentEnd);
				477
				478	// FIXME: It would be better to scan the text once, finding either the block
				479	// end command or newline.
				480	//
				481	// Extract current line.
				482	const char *Newline = findNewline(BufferPtr, CommentEnd);
				483	StringRef Line(BufferPtr, Newline - BufferPtr);
				484
				485	// Look for end command in current line.
				486	size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	487	const char *TextEnd;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	488	const char *NextLine;
				489	if (Pos == StringRef::npos) {
				490	// Current line is completely verbatim.
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	491	TextEnd = Newline;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	492	NextLine = skipNewline(Newline, CommentEnd);
				493	} else if (Pos == 0) {
				494	// Current line contains just an end command.
				495	const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	496	StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	497	formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	498	T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	499	State = LS_Normal;
				500	return;
				501	} else {
				502	// There is some text, followed by end command. Extract text first.
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	503	TextEnd = BufferPtr + Pos;
				504	NextLine = TextEnd;
Dmitri Gribenko	e4a3997	2012-07-18 23:01:58 +0000	[diff] [blame]	505	// If there is only whitespace before end command, skip whitespace.
				506	if (isWhitespace(BufferPtr, TextEnd)) {
				507	BufferPtr = TextEnd;
				508	goto again;
				509	}
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	510	}
				511
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	512	StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	513	formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	514	T.setVerbatimBlockText(Text);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	515
				516	State = LS_VerbatimBlockBody;
				517	}
				518
				519	void Lexer::lexVerbatimBlockBody(Token &T) {
				520	assert(State == LS_VerbatimBlockBody);
				521
				522	if (CommentState == LCS_InsideCComment)
				523	skipLineStartingDecorations();
				524
Dmitri Gribenko	8b72062	2015-04-15 23:45:43 +0000	[diff] [blame]	525	if (BufferPtr == CommentEnd) {
				526	formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
				527	T.setVerbatimBlockText("");
				528	return;
				529	}
				530
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	531	lexVerbatimBlockFirstLine(T);
				532	}
				533
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	534	void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
				535	const CommandInfo *Info) {
				536	assert(Info->IsVerbatimLineCommand);
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	537	formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	538	T.setVerbatimLineID(Info->getID());
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	539
				540	State = LS_VerbatimLineText;
				541	}
				542
				543	void Lexer::lexVerbatimLineText(Token &T) {
				544	assert(State == LS_VerbatimLineText);
				545
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	546	// Extract current line.
				547	const char *Newline = findNewline(BufferPtr, CommentEnd);
Craig Topper	bf3e327	2014-08-30 16:55:52 +0000	[diff] [blame]	548	StringRef Text(BufferPtr, Newline - BufferPtr);
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	549	formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	550	T.setVerbatimLineText(Text);
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	551
				552	State = LS_Normal;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	553	}
				554
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	555	void Lexer::lexHTMLCharacterReference(Token &T) {
				556	const char *TokenPtr = BufferPtr;
				557	assert(*TokenPtr == '&');
				558	TokenPtr++;
				559	if (TokenPtr == CommentEnd) {
				560	formTextToken(T, TokenPtr);
				561	return;
				562	}
				563	const char *NamePtr;
				564	bool isNamed = false;
				565	bool isDecimal = false;
				566	char C = *TokenPtr;
				567	if (isHTMLNamedCharacterReferenceCharacter(C)) {
				568	NamePtr = TokenPtr;
				569	TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
				570	isNamed = true;
				571	} else if (C == '#') {
				572	TokenPtr++;
				573	if (TokenPtr == CommentEnd) {
				574	formTextToken(T, TokenPtr);
				575	return;
				576	}
				577	C = *TokenPtr;
				578	if (isHTMLDecimalCharacterReferenceCharacter(C)) {
				579	NamePtr = TokenPtr;
				580	TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
				581	isDecimal = true;
				582	} else if (C == 'x' \|\| C == 'X') {
				583	TokenPtr++;
				584	NamePtr = TokenPtr;
				585	TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
				586	} else {
				587	formTextToken(T, TokenPtr);
				588	return;
				589	}
				590	} else {
				591	formTextToken(T, TokenPtr);
				592	return;
				593	}
				594	if (NamePtr == TokenPtr \|\| TokenPtr == CommentEnd \|\|
				595	*TokenPtr != ';') {
				596	formTextToken(T, TokenPtr);
				597	return;
				598	}
				599	StringRef Name(NamePtr, TokenPtr - NamePtr);
				600	TokenPtr++; // Skip semicolon.
				601	StringRef Resolved;
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	602	if (isNamed)
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	603	Resolved = resolveHTMLNamedCharacterReference(Name);
				604	else if (isDecimal)
				605	Resolved = resolveHTMLDecimalCharacterReference(Name);
				606	else
				607	Resolved = resolveHTMLHexCharacterReference(Name);
				608
				609	if (Resolved.empty()) {
				610	formTextToken(T, TokenPtr);
				611	return;
				612	}
				613	formTokenWithChars(T, TokenPtr, tok::text);
				614	T.setText(Resolved);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	615	}
				616
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	617	void Lexer::setupAndLexHTMLStartTag(Token &T) {
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	618	assert(BufferPtr[0] == '<' &&
				619	isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	620	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	621	StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko	107618a	2012-08-22 22:56:08 +0000	[diff] [blame]	622	if (!isHTMLTagName(Name)) {
				623	formTextToken(T, TagNameEnd);
				624	return;
				625	}
				626
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	627	formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
				628	T.setHTMLTagStartName(Name);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	629
				630	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
				631
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	632	const char C = *BufferPtr;
				633	if (BufferPtr != CommentEnd &&
Dmitri Gribenko	f26054f	2012-07-11 21:38:39 +0000	[diff] [blame]	634	(C == '>' \|\| C == '/' \|\| isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	635	State = LS_HTMLStartTag;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	636	}
				637
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	638	void Lexer::lexHTMLStartTag(Token &T) {
				639	assert(State == LS_HTMLStartTag);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	640
				641	const char *TokenPtr = BufferPtr;
				642	char C = *TokenPtr;
				643	if (isHTMLIdentifierCharacter(C)) {
				644	TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	645	StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	646	formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	647	T.setHTMLIdent(Ident);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	648	} else {
				649	switch (C) {
				650	case '=':
				651	TokenPtr++;
				652	formTokenWithChars(T, TokenPtr, tok::html_equals);
				653	break;
				654	case '\"':
				655	case '\'': {
				656	const char *OpenQuote = TokenPtr;
				657	TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
				658	const char *ClosingQuote = TokenPtr;
				659	if (TokenPtr != CommentEnd) // Skip closing quote.
				660	TokenPtr++;
				661	formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
				662	T.setHTMLQuotedString(StringRef(OpenQuote + 1,
				663	ClosingQuote - (OpenQuote + 1)));
				664	break;
				665	}
				666	case '>':
				667	TokenPtr++;
				668	formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	669	State = LS_Normal;
				670	return;
Dmitri Gribenko	f26054f	2012-07-11 21:38:39 +0000	[diff] [blame]	671	case '/':
				672	TokenPtr++;
				673	if (TokenPtr != CommentEnd && *TokenPtr == '>') {
				674	TokenPtr++;
				675	formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	676	} else
				677	formTextToken(T, TokenPtr);
				678
Dmitri Gribenko	f26054f	2012-07-11 21:38:39 +0000	[diff] [blame]	679	State = LS_Normal;
				680	return;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	681	}
				682	}
				683
				684	// Now look ahead and return to normal state if we don't see any HTML tokens
				685	// ahead.
				686	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
				687	if (BufferPtr == CommentEnd) {
				688	State = LS_Normal;
				689	return;
				690	}
				691
				692	C = *BufferPtr;
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	693	if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	694	C != '=' && C != '\"' && C != '\'' && C != '>') {
				695	State = LS_Normal;
				696	return;
				697	}
				698	}
				699
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	700	void Lexer::setupAndLexHTMLEndTag(Token &T) {
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	701	assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
				702
				703	const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
				704	const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
Dmitri Gribenko	107618a	2012-08-22 22:56:08 +0000	[diff] [blame]	705	StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
				706	if (!isHTMLTagName(Name)) {
				707	formTextToken(T, TagNameEnd);
				708	return;
				709	}
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	710
				711	const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	712
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	713	formTokenWithChars(T, End, tok::html_end_tag);
Dmitri Gribenko	107618a	2012-08-22 22:56:08 +0000	[diff] [blame]	714	T.setHTMLTagEndName(Name);
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	715
				716	if (BufferPtr != CommentEnd && *BufferPtr == '>')
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	717	State = LS_HTMLEndTag;
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	718	}
				719
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	720	void Lexer::lexHTMLEndTag(Token &T) {
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	721	assert(BufferPtr != CommentEnd && *BufferPtr == '>');
				722
				723	formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
				724	State = LS_Normal;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	725	}
				726
Fariborz Jahanian	5b63707	2013-05-03 23:15:20 +0000	[diff] [blame]	727	Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
				728	const CommandTraits &Traits,
Dmitri Gribenko	6bab911	2012-08-31 10:35:30 +0000	[diff] [blame]	729	SourceLocation FileLoc,
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	730	const char BufferStart, const char BufferEnd):
Fariborz Jahanian	5b63707	2013-05-03 23:15:20 +0000	[diff] [blame]	731	Allocator(Allocator), Diags(Diags), Traits(Traits),
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	732	BufferStart(BufferStart), BufferEnd(BufferEnd),
Dmitri Gribenko	6bab911	2012-08-31 10:35:30 +0000	[diff] [blame]	733	FileLoc(FileLoc), BufferPtr(BufferStart),
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	734	CommentState(LCS_BeforeComment), State(LS_Normal) {
				735	}
				736
				737	void Lexer::lex(Token &T) {
				738	again:
				739	switch (CommentState) {
				740	case LCS_BeforeComment:
				741	if (BufferPtr == BufferEnd) {
				742	formTokenWithChars(T, BufferPtr, tok::eof);
				743	return;
				744	}
				745
				746	assert(*BufferPtr == '/');
				747	BufferPtr++; // Skip first slash.
				748	switch(*BufferPtr) {
				749	case '/': { // BCPL comment.
				750	BufferPtr++; // Skip second slash.
				751
				752	if (BufferPtr != BufferEnd) {
				753	// Skip Doxygen magic marker, if it is present.
				754	// It might be missing because of a typo //< or /*<, or because we
				755	// merged this non-Doxygen comment into a bunch of Doxygen comments
				756	// around it: /** ... / / ... / /* ... */
				757	const char C = *BufferPtr;
				758	if (C == '/' \|\| C == '!')
				759	BufferPtr++;
				760	}
				761
				762	// Skip less-than symbol that marks trailing comments.
				763	// Skip it even if the comment is not a Doxygen one, because //< and /*<
				764	// are frequent typos.
				765	if (BufferPtr != BufferEnd && *BufferPtr == '<')
				766	BufferPtr++;
				767
				768	CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	769	if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
				770	State = LS_Normal;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	771	CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
				772	goto again;
				773	}
				774	case '*': { // C comment.
				775	BufferPtr++; // Skip star.
				776
				777	// Skip Doxygen magic marker.
				778	const char C = *BufferPtr;
				779	if ((C == '' && (BufferPtr + 1) != '/') \|\| C == '!')
				780	BufferPtr++;
				781
				782	// Skip less-than symbol that marks trailing comments.
				783	if (BufferPtr != BufferEnd && *BufferPtr == '<')
				784	BufferPtr++;
				785
				786	CommentState = LCS_InsideCComment;
				787	State = LS_Normal;
				788	CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
				789	goto again;
				790	}
				791	default:
				792	llvm_unreachable("second character of comment should be '/' or '*'");
				793	}
				794
				795	case LCS_BetweenComments: {
				796	// Consecutive comments are extracted only if there is only whitespace
				797	// between them. So we can search for the start of the next comment.
				798	const char *EndWhitespace = BufferPtr;
				799	while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
				800	EndWhitespace++;
				801
				802	// Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	803	// between them -- guaranteed by comment extraction) into a newline. We
				804	// have two newlines between C comments in total (first one was synthesized
				805	// after a comment).
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	806	formTokenWithChars(T, EndWhitespace, tok::newline);
				807
				808	CommentState = LCS_BeforeComment;
				809	break;
				810	}
				811
				812	case LCS_InsideBCPLComment:
				813	case LCS_InsideCComment:
				814	if (BufferPtr != CommentEnd) {
				815	lexCommentText(T);
				816	break;
				817	} else {
				818	// Skip C comment closing sequence.
				819	if (CommentState == LCS_InsideCComment) {
				820	assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
				821	BufferPtr += 2;
				822	assert(BufferPtr <= BufferEnd);
				823
				824	// Synthenize newline just after the C comment, regardless if there is
				825	// actually a newline.
				826	formTokenWithChars(T, BufferPtr, tok::newline);
				827
				828	CommentState = LCS_BetweenComments;
				829	break;
				830	} else {
				831	// Don't synthesized a newline after BCPL comment.
				832	CommentState = LCS_BetweenComments;
				833	goto again;
				834	}
				835	}
				836	}
				837	}
				838
				839	StringRef Lexer::getSpelling(const Token &Tok,
				840	const SourceManager &SourceMgr,
				841	bool *Invalid) const {
				842	SourceLocation Loc = Tok.getLocation();
				843	std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
				844
				845	bool InvalidTemp = false;
				846	StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
				847	if (InvalidTemp) {
				848	*Invalid = true;
				849	return StringRef();
				850	}
				851
				852	const char *Begin = File.data() + LocInfo.second;
				853	return StringRef(Begin, Tok.getLength());
				854	}
				855
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	856	} // end namespace comments
				857	} // end namespace clang