Blame - clang/lib/AST/CommentLexer.cpp - toolchain/llvm-project

blob: 65d0f56f09ab382e0f42417ea574ffbf4a2c5a65 [file] [log] [blame]

Eugene Zelenko	0a4f3f4	2016-02-10 19:11:58 +0000	[diff] [blame]	1	//===--- CommentLexer.cpp -------------------------------------------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	10	#include "clang/AST/CommentLexer.h"
Dmitri Gribenko	ca7f80a	2012-08-09 00:03:17 +0000	[diff] [blame]	11	#include "clang/AST/CommentCommandTraits.h"
Fariborz Jahanian	6738e43	2013-05-04 00:47:28 +0000	[diff] [blame]	12	#include "clang/AST/CommentDiagnostic.h"
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	13	#include "clang/Basic/CharInfo.h"
Dmitri Gribenko	b2e5482	2013-01-19 22:06:05 +0000	[diff] [blame]	14	#include "llvm/ADT/StringExtras.h"
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	15	#include "llvm/ADT/StringSwitch.h"
Dmitri Gribenko	9feeef4	2013-01-30 12:06:08 +0000	[diff] [blame]	16	#include "llvm/Support/ConvertUTF.h"
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	17	#include "llvm/Support/ErrorHandling.h"
				18
				19	namespace clang {
				20	namespace comments {
				21
				22	void Token::dump(const Lexer &L, const SourceManager &SM) const {
				23	llvm::errs() << "comments::Token Kind=" << Kind << " ";
				24	Loc.dump(SM);
				25	llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
				26	}
				27
Dmitri Gribenko	74f4d02	2013-02-10 11:54:22 +0000	[diff] [blame]	28	static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	29	return isLetter(C);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	30	}
				31
Dmitri Gribenko	74f4d02	2013-02-10 11:54:22 +0000	[diff] [blame]	32	static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	33	return isDigit(C);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	34	}
				35
Dmitri Gribenko	74f4d02	2013-02-10 11:54:22 +0000	[diff] [blame]	36	static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	37	return isHexDigit(C);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	38	}
Dmitri Gribenko	107618a	2012-08-22 22:56:08 +0000	[diff] [blame]	39
Dmitri Gribenko	74f4d02	2013-02-10 11:54:22 +0000	[diff] [blame]	40	static inline StringRef convertCodePointToUTF8(
				41	llvm::BumpPtrAllocator &Allocator,
				42	unsigned CodePoint) {
Fariborz Jahanian	7b3ae19	2013-01-29 23:42:26 +0000	[diff] [blame]	43	char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
				44	char *ResolvedPtr = Resolved;
Dmitri Gribenko	9feeef4	2013-01-30 12:06:08 +0000	[diff] [blame]	45	if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
Fariborz Jahanian	7b3ae19	2013-01-29 23:42:26 +0000	[diff] [blame]	46	return StringRef(Resolved, ResolvedPtr - Resolved);
				47	else
				48	return StringRef();
				49	}
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	50
Dmitri Gribenko	74f4d02	2013-02-10 11:54:22 +0000	[diff] [blame]	51	namespace {
				52
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	53	#include "clang/AST/CommentHTMLTags.inc"
				54	#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
				55
Eugene Zelenko	0a4f3f4	2016-02-10 19:11:58 +0000	[diff] [blame]	56	} // end anonymous namespace
Fariborz Jahanian	7b3ae19	2013-01-29 23:42:26 +0000	[diff] [blame]	57
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	58	StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	59	// Fast path, first check a few most widely used named character references.
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	60	return llvm::StringSwitch<StringRef>(Name)
				61	.Case("amp", "&")
				62	.Case("lt", "<")
				63	.Case("gt", ">")
				64	.Case("quot", "\"")
				65	.Case("apos", "\'")
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	66	// Slow path.
				67	.Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
Fariborz Jahanian	7b3ae19	2013-01-29 23:42:26 +0000	[diff] [blame]	68	}
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	69
				70	StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
				71	unsigned CodePoint = 0;
				72	for (unsigned i = 0, e = Name.size(); i != e; ++i) {
				73	assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
				74	CodePoint *= 10;
				75	CodePoint += Name[i] - '0';
				76	}
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	77	return convertCodePointToUTF8(Allocator, CodePoint);
				78	}
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	79
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	80	StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
				81	unsigned CodePoint = 0;
				82	for (unsigned i = 0, e = Name.size(); i != e; ++i) {
				83	CodePoint *= 16;
				84	const char C = Name[i];
				85	assert(isHTMLHexCharacterReferenceCharacter(C));
				86	CodePoint += llvm::hexDigitValue(C);
				87	}
				88	return convertCodePointToUTF8(Allocator, CodePoint);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	89	}
				90
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	91	void Lexer::skipLineStartingDecorations() {
				92	// This function should be called only for C comments
				93	assert(CommentState == LCS_InsideCComment);
				94
				95	if (BufferPtr == CommentEnd)
				96	return;
				97
				98	switch (*BufferPtr) {
				99	case ' ':
				100	case '\t':
				101	case '\f':
				102	case '\v': {
				103	const char *NewBufferPtr = BufferPtr;
				104	NewBufferPtr++;
				105	if (NewBufferPtr == CommentEnd)
				106	return;
				107
				108	char C = *NewBufferPtr;
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	109	while (isHorizontalWhitespace(C)) {
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	110	NewBufferPtr++;
				111	if (NewBufferPtr == CommentEnd)
				112	return;
				113	C = *NewBufferPtr;
				114	}
				115	if (C == '*')
				116	BufferPtr = NewBufferPtr + 1;
				117	break;
				118	}
				119	case '*':
				120	BufferPtr++;
				121	break;
				122	}
				123	}
				124
				125	namespace {
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	126	/// Returns pointer to the first newline character in the string.
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	127	const char findNewline(const char BufferPtr, const char *BufferEnd) {
				128	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	129	if (isVerticalWhitespace(*BufferPtr))
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	130	return BufferPtr;
				131	}
				132	return BufferEnd;
				133	}
				134
				135	const char skipNewline(const char BufferPtr, const char *BufferEnd) {
				136	if (BufferPtr == BufferEnd)
				137	return BufferPtr;
				138
				139	if (*BufferPtr == '\n')
				140	BufferPtr++;
				141	else {
				142	assert(*BufferPtr == '\r');
				143	BufferPtr++;
				144	if (BufferPtr != BufferEnd && *BufferPtr == '\n')
				145	BufferPtr++;
				146	}
				147	return BufferPtr;
				148	}
				149
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	150	const char skipNamedCharacterReference(const char BufferPtr,
				151	const char *BufferEnd) {
				152	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				153	if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
				154	return BufferPtr;
				155	}
				156	return BufferEnd;
				157	}
				158
				159	const char skipDecimalCharacterReference(const char BufferPtr,
				160	const char *BufferEnd) {
				161	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				162	if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
				163	return BufferPtr;
				164	}
				165	return BufferEnd;
				166	}
				167
				168	const char skipHexCharacterReference(const char BufferPtr,
Dmitri Gribenko	11f54e8	2013-08-23 17:48:41 +0000	[diff] [blame]	169	const char *BufferEnd) {
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	170	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				171	if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
				172	return BufferPtr;
				173	}
				174	return BufferEnd;
				175	}
				176
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	177	bool isHTMLIdentifierStartingCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	178	return isLetter(C);
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	179	}
				180
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	181	bool isHTMLIdentifierCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	182	return isAlphanumeric(C);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	183	}
				184
				185	const char skipHTMLIdentifier(const char BufferPtr, const char *BufferEnd) {
				186	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				187	if (!isHTMLIdentifierCharacter(*BufferPtr))
				188	return BufferPtr;
				189	}
				190	return BufferEnd;
				191	}
				192
				193	/// Skip HTML string quoted in single or double quotes. Escaping quotes inside
				194	/// string allowed.
				195	///
				196	/// Returns pointer to closing quote.
				197	const char skipHTMLQuotedString(const char BufferPtr, const char *BufferEnd)
				198	{
				199	const char Quote = *BufferPtr;
				200	assert(Quote == '\"' \|\| Quote == '\'');
				201
				202	BufferPtr++;
				203	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				204	const char C = *BufferPtr;
				205	if (C == Quote && BufferPtr[-1] != '\\')
				206	return BufferPtr;
				207	}
				208	return BufferEnd;
				209	}
				210
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	211	const char skipWhitespace(const char BufferPtr, const char *BufferEnd) {
				212	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				213	if (!isWhitespace(*BufferPtr))
				214	return BufferPtr;
				215	}
				216	return BufferEnd;
				217	}
				218
Dmitri Gribenko	e4a3997	2012-07-18 23:01:58 +0000	[diff] [blame]	219	bool isWhitespace(const char BufferPtr, const char BufferEnd) {
				220	return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
				221	}
				222
Dmitri Gribenko	ad45ad6	2012-09-14 16:35:35 +0000	[diff] [blame]	223	bool isCommandNameStartCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	224	return isLetter(C);
Dmitri Gribenko	ad45ad6	2012-09-14 16:35:35 +0000	[diff] [blame]	225	}
				226
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	227	bool isCommandNameCharacter(char C) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	228	return isAlphanumeric(C);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	229	}
				230
				231	const char skipCommandName(const char BufferPtr, const char *BufferEnd) {
				232	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				233	if (!isCommandNameCharacter(*BufferPtr))
				234	return BufferPtr;
				235	}
				236	return BufferEnd;
				237	}
				238
				239	/// Return the one past end pointer for BCPL comments.
				240	/// Handles newlines escaped with backslash or trigraph for backslahs.
				241	const char findBCPLCommentEnd(const char BufferPtr, const char *BufferEnd) {
				242	const char *CurPtr = BufferPtr;
				243	while (CurPtr != BufferEnd) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	244	while (!isVerticalWhitespace(*CurPtr)) {
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	245	CurPtr++;
				246	if (CurPtr == BufferEnd)
				247	return BufferEnd;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	248	}
				249	// We found a newline, check if it is escaped.
				250	const char *EscapePtr = CurPtr - 1;
				251	while(isHorizontalWhitespace(*EscapePtr))
				252	EscapePtr--;
				253
				254	if (*EscapePtr == '\\' \|\|
				255	(EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
				256	EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
				257	// We found an escaped newline.
				258	CurPtr = skipNewline(CurPtr, BufferEnd);
				259	} else
				260	return CurPtr; // Not an escaped newline.
				261	}
				262	return BufferEnd;
				263	}
				264
				265	/// Return the one past end pointer for C comments.
				266	/// Very dumb, does not handle escaped newlines or trigraphs.
				267	const char findCCommentEnd(const char BufferPtr, const char *BufferEnd) {
				268	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				269	if (BufferPtr == '') {
				270	assert(BufferPtr + 1 != BufferEnd);
				271	if (*(BufferPtr + 1) == '/')
				272	return BufferPtr;
				273	}
				274	}
				275	llvm_unreachable("buffer end hit before '*/' was seen");
				276	}
Fariborz Jahanian	6c7a166	2013-05-08 19:21:00 +0000	[diff] [blame]	277
Eugene Zelenko	0a4f3f4	2016-02-10 19:11:58 +0000	[diff] [blame]	278	} // end anonymous namespace
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	279
Alp Toker	3ffab05	2013-12-07 13:51:26 +0000	[diff] [blame]	280	void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
				281	tok::TokenKind Kind) {
				282	const unsigned TokLen = TokEnd - BufferPtr;
				283	Result.setLocation(getSourceLocation(BufferPtr));
				284	Result.setKind(Kind);
				285	Result.setLength(TokLen);
				286	#ifndef NDEBUG
				287	Result.TextPtr = "<UNSET>";
				288	Result.IntVal = 7;
				289	#endif
				290	BufferPtr = TokEnd;
				291	}
				292
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	293	void Lexer::lexCommentText(Token &T) {
				294	assert(CommentState == LCS_InsideBCPLComment \|\|
				295	CommentState == LCS_InsideCComment);
				296
				297	switch (State) {
				298	case LS_Normal:
				299	break;
				300	case LS_VerbatimBlockFirstLine:
				301	lexVerbatimBlockFirstLine(T);
				302	return;
				303	case LS_VerbatimBlockBody:
				304	lexVerbatimBlockBody(T);
				305	return;
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	306	case LS_VerbatimLineText:
				307	lexVerbatimLineText(T);
				308	return;
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	309	case LS_HTMLStartTag:
				310	lexHTMLStartTag(T);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	311	return;
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	312	case LS_HTMLEndTag:
				313	lexHTMLEndTag(T);
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	314	return;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	315	}
				316
				317	assert(State == LS_Normal);
				318
				319	const char *TokenPtr = BufferPtr;
				320	assert(TokenPtr < CommentEnd);
				321	while (TokenPtr != CommentEnd) {
				322	switch(*TokenPtr) {
				323	case '\\':
				324	case '@': {
Dmitri Gribenko	bcf7f4d	2013-03-04 23:06:15 +0000	[diff] [blame]	325	// Commands that start with a backslash and commands that start with
				326	// 'at' have equivalent semantics. But we keep information about the
				327	// exact syntax in AST for comments.
				328	tok::TokenKind CommandKind =
				329	(*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	330	TokenPtr++;
				331	if (TokenPtr == CommentEnd) {
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	332	formTextToken(T, TokenPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	333	return;
				334	}
				335	char C = *TokenPtr;
				336	switch (C) {
				337	default:
				338	break;
				339
				340	case '\\': case '@': case '&': case '$':
				341	case '#': case '<': case '>': case '%':
				342	case '\"': case '.': case ':':
				343	// This is one of \\ \@ \& \$ etc escape sequences.
				344	TokenPtr++;
				345	if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
				346	// This is the \:: escape sequence.
				347	TokenPtr++;
				348	}
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	349	StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	350	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	351	T.setText(UnescapedText);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	352	return;
				353	}
				354
				355	// Don't make zero-length commands.
Dmitri Gribenko	ad45ad6	2012-09-14 16:35:35 +0000	[diff] [blame]	356	if (!isCommandNameStartCharacter(*TokenPtr)) {
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	357	formTextToken(T, TokenPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	358	return;
				359	}
				360
				361	TokenPtr = skipCommandName(TokenPtr, CommentEnd);
				362	unsigned Length = TokenPtr - (BufferPtr + 1);
				363
				364	// Hardcoded support for lexing LaTeX formula commands
				365	// \f$ \f[ \f] \f{ \f} as a single command.
				366	if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
				367	C = *TokenPtr;
				368	if (C == '$' \|\| C == '[' \|\| C == ']' \|\| C == '{' \|\| C == '}') {
				369	TokenPtr++;
				370	Length++;
				371	}
				372	}
				373
Craig Topper	bf3e327	2014-08-30 16:55:52 +0000	[diff] [blame]	374	StringRef CommandName(BufferPtr + 1, Length);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	375
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	376	const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
				377	if (!Info) {
Fariborz Jahanian	ebb262f	2013-05-08 20:29:57 +0000	[diff] [blame]	378	if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
Fariborz Jahanian	6c7a166	2013-05-08 19:21:00 +0000	[diff] [blame]	379	StringRef CorrectedName = Info->Name;
Benjamin Kramer	2dece57	2013-12-01 15:09:32 +0000	[diff] [blame]	380	SourceLocation Loc = getSourceLocation(BufferPtr);
Erik Verbruggen	4908237	2016-10-25 10:06:11 +0000	[diff] [blame]	381	SourceLocation EndLoc = getSourceLocation(TokenPtr);
				382	SourceRange FullRange = SourceRange(Loc, EndLoc);
				383	SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
Benjamin Kramer	2dece57	2013-12-01 15:09:32 +0000	[diff] [blame]	384	Diag(Loc, diag::warn_correct_comment_command_name)
Erik Verbruggen	4908237	2016-10-25 10:06:11 +0000	[diff] [blame]	385	<< FullRange << CommandName << CorrectedName
Fariborz Jahanian	6c7a166	2013-05-08 19:21:00 +0000	[diff] [blame]	386	<< FixItHint::CreateReplacement(CommandRange, CorrectedName);
				387	} else {
Benjamin Kramer	2dece57	2013-12-01 15:09:32 +0000	[diff] [blame]	388	formTokenWithChars(T, TokenPtr, tok::unknown_command);
				389	T.setUnknownCommandName(CommandName);
Erik Verbruggen	4908237	2016-10-25 10:06:11 +0000	[diff] [blame]	390	Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
				391	<< SourceRange(T.getLocation(), T.getEndLocation());
Fariborz Jahanian	6c7a166	2013-05-08 19:21:00 +0000	[diff] [blame]	392	return;
				393	}
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	394	}
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	395	if (Info->IsVerbatimBlockCommand) {
				396	setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
				397	return;
				398	}
				399	if (Info->IsVerbatimLineCommand) {
				400	setupAndLexVerbatimLine(T, TokenPtr, Info);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	401	return;
				402	}
Dmitri Gribenko	bcf7f4d	2013-03-04 23:06:15 +0000	[diff] [blame]	403	formTokenWithChars(T, TokenPtr, CommandKind);
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	404	T.setCommandID(Info->getID());
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	405	return;
				406	}
				407
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	408	case '&':
				409	lexHTMLCharacterReference(T);
				410	return;
				411
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	412	case '<': {
				413	TokenPtr++;
				414	if (TokenPtr == CommentEnd) {
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	415	formTextToken(T, TokenPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	416	return;
				417	}
				418	const char C = *TokenPtr;
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	419	if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	420	setupAndLexHTMLStartTag(T);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	421	else if (C == '/')
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	422	setupAndLexHTMLEndTag(T);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	423	else
				424	formTextToken(T, TokenPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	425	return;
				426	}
				427
				428	case '\n':
				429	case '\r':
				430	TokenPtr = skipNewline(TokenPtr, CommentEnd);
				431	formTokenWithChars(T, TokenPtr, tok::newline);
				432
				433	if (CommentState == LCS_InsideCComment)
				434	skipLineStartingDecorations();
				435	return;
				436
				437	default: {
Dmitri Gribenko	10af67a	2012-12-30 19:45:46 +0000	[diff] [blame]	438	size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
				439	find_first_of("\n\r\\@&<");
				440	if (End != StringRef::npos)
				441	TokenPtr += End;
				442	else
				443	TokenPtr = CommentEnd;
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	444	formTextToken(T, TokenPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	445	return;
				446	}
				447	}
				448	}
				449	}
				450
				451	void Lexer::setupAndLexVerbatimBlock(Token &T,
				452	const char *TextBegin,
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	453	char Marker, const CommandInfo *Info) {
				454	assert(Info->IsVerbatimBlockCommand);
				455
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	456	VerbatimBlockEndCommandName.clear();
				457	VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	458	VerbatimBlockEndCommandName.append(Info->EndCommandName);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	459
				460	formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	461	T.setVerbatimBlockID(Info->getID());
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	462
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	463	// If there is a newline following the verbatim opening command, skip the
				464	// newline so that we don't create an tok::verbatim_block_line with empty
				465	// text content.
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	466	if (BufferPtr != CommentEnd &&
				467	isVerticalWhitespace(*BufferPtr)) {
				468	BufferPtr = skipNewline(BufferPtr, CommentEnd);
				469	State = LS_VerbatimBlockBody;
				470	return;
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	471	}
				472
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	473	State = LS_VerbatimBlockFirstLine;
				474	}
				475
				476	void Lexer::lexVerbatimBlockFirstLine(Token &T) {
Dmitri Gribenko	e4a3997	2012-07-18 23:01:58 +0000	[diff] [blame]	477	again:
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	478	assert(BufferPtr < CommentEnd);
				479
				480	// FIXME: It would be better to scan the text once, finding either the block
				481	// end command or newline.
				482	//
				483	// Extract current line.
				484	const char *Newline = findNewline(BufferPtr, CommentEnd);
				485	StringRef Line(BufferPtr, Newline - BufferPtr);
				486
				487	// Look for end command in current line.
				488	size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	489	const char *TextEnd;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	490	const char *NextLine;
				491	if (Pos == StringRef::npos) {
				492	// Current line is completely verbatim.
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	493	TextEnd = Newline;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	494	NextLine = skipNewline(Newline, CommentEnd);
				495	} else if (Pos == 0) {
				496	// Current line contains just an end command.
				497	const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	498	StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	499	formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	500	T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	501	State = LS_Normal;
				502	return;
				503	} else {
				504	// There is some text, followed by end command. Extract text first.
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	505	TextEnd = BufferPtr + Pos;
				506	NextLine = TextEnd;
Dmitri Gribenko	e4a3997	2012-07-18 23:01:58 +0000	[diff] [blame]	507	// If there is only whitespace before end command, skip whitespace.
				508	if (isWhitespace(BufferPtr, TextEnd)) {
				509	BufferPtr = TextEnd;
				510	goto again;
				511	}
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	512	}
				513
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	514	StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	515	formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	516	T.setVerbatimBlockText(Text);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	517
				518	State = LS_VerbatimBlockBody;
				519	}
				520
				521	void Lexer::lexVerbatimBlockBody(Token &T) {
				522	assert(State == LS_VerbatimBlockBody);
				523
				524	if (CommentState == LCS_InsideCComment)
				525	skipLineStartingDecorations();
				526
Dmitri Gribenko	8b72062	2015-04-15 23:45:43 +0000	[diff] [blame]	527	if (BufferPtr == CommentEnd) {
				528	formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
				529	T.setVerbatimBlockText("");
				530	return;
				531	}
				532
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	533	lexVerbatimBlockFirstLine(T);
				534	}
				535
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	536	void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
				537	const CommandInfo *Info) {
				538	assert(Info->IsVerbatimLineCommand);
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	539	formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	540	T.setVerbatimLineID(Info->getID());
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	541
				542	State = LS_VerbatimLineText;
				543	}
				544
				545	void Lexer::lexVerbatimLineText(Token &T) {
				546	assert(State == LS_VerbatimLineText);
				547
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	548	// Extract current line.
				549	const char *Newline = findNewline(BufferPtr, CommentEnd);
Craig Topper	bf3e327	2014-08-30 16:55:52 +0000	[diff] [blame]	550	StringRef Text(BufferPtr, Newline - BufferPtr);
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	551	formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	552	T.setVerbatimLineText(Text);
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	553
				554	State = LS_Normal;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	555	}
				556
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	557	void Lexer::lexHTMLCharacterReference(Token &T) {
				558	const char *TokenPtr = BufferPtr;
				559	assert(*TokenPtr == '&');
				560	TokenPtr++;
				561	if (TokenPtr == CommentEnd) {
				562	formTextToken(T, TokenPtr);
				563	return;
				564	}
				565	const char *NamePtr;
				566	bool isNamed = false;
				567	bool isDecimal = false;
				568	char C = *TokenPtr;
				569	if (isHTMLNamedCharacterReferenceCharacter(C)) {
				570	NamePtr = TokenPtr;
				571	TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
				572	isNamed = true;
				573	} else if (C == '#') {
				574	TokenPtr++;
				575	if (TokenPtr == CommentEnd) {
				576	formTextToken(T, TokenPtr);
				577	return;
				578	}
				579	C = *TokenPtr;
				580	if (isHTMLDecimalCharacterReferenceCharacter(C)) {
				581	NamePtr = TokenPtr;
				582	TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
				583	isDecimal = true;
				584	} else if (C == 'x' \|\| C == 'X') {
				585	TokenPtr++;
				586	NamePtr = TokenPtr;
				587	TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
				588	} else {
				589	formTextToken(T, TokenPtr);
				590	return;
				591	}
				592	} else {
				593	formTextToken(T, TokenPtr);
				594	return;
				595	}
				596	if (NamePtr == TokenPtr \|\| TokenPtr == CommentEnd \|\|
				597	*TokenPtr != ';') {
				598	formTextToken(T, TokenPtr);
				599	return;
				600	}
				601	StringRef Name(NamePtr, TokenPtr - NamePtr);
				602	TokenPtr++; // Skip semicolon.
				603	StringRef Resolved;
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	604	if (isNamed)
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	605	Resolved = resolveHTMLNamedCharacterReference(Name);
				606	else if (isDecimal)
				607	Resolved = resolveHTMLDecimalCharacterReference(Name);
				608	else
				609	Resolved = resolveHTMLHexCharacterReference(Name);
				610
				611	if (Resolved.empty()) {
				612	formTextToken(T, TokenPtr);
				613	return;
				614	}
				615	formTokenWithChars(T, TokenPtr, tok::text);
				616	T.setText(Resolved);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	617	}
				618
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	619	void Lexer::setupAndLexHTMLStartTag(Token &T) {
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	620	assert(BufferPtr[0] == '<' &&
				621	isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	622	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	623	StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko	107618a	2012-08-22 22:56:08 +0000	[diff] [blame]	624	if (!isHTMLTagName(Name)) {
				625	formTextToken(T, TagNameEnd);
				626	return;
				627	}
				628
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	629	formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
				630	T.setHTMLTagStartName(Name);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	631
				632	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
				633
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	634	const char C = *BufferPtr;
				635	if (BufferPtr != CommentEnd &&
Dmitri Gribenko	f26054f	2012-07-11 21:38:39 +0000	[diff] [blame]	636	(C == '>' \|\| C == '/' \|\| isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	637	State = LS_HTMLStartTag;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	638	}
				639
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	640	void Lexer::lexHTMLStartTag(Token &T) {
				641	assert(State == LS_HTMLStartTag);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	642
				643	const char *TokenPtr = BufferPtr;
				644	char C = *TokenPtr;
				645	if (isHTMLIdentifierCharacter(C)) {
				646	TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	647	StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	648	formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	649	T.setHTMLIdent(Ident);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	650	} else {
				651	switch (C) {
				652	case '=':
				653	TokenPtr++;
				654	formTokenWithChars(T, TokenPtr, tok::html_equals);
				655	break;
				656	case '\"':
				657	case '\'': {
				658	const char *OpenQuote = TokenPtr;
				659	TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
				660	const char *ClosingQuote = TokenPtr;
				661	if (TokenPtr != CommentEnd) // Skip closing quote.
				662	TokenPtr++;
				663	formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
				664	T.setHTMLQuotedString(StringRef(OpenQuote + 1,
				665	ClosingQuote - (OpenQuote + 1)));
				666	break;
				667	}
				668	case '>':
				669	TokenPtr++;
				670	formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	671	State = LS_Normal;
				672	return;
Dmitri Gribenko	f26054f	2012-07-11 21:38:39 +0000	[diff] [blame]	673	case '/':
				674	TokenPtr++;
				675	if (TokenPtr != CommentEnd && *TokenPtr == '>') {
				676	TokenPtr++;
				677	formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	678	} else
				679	formTextToken(T, TokenPtr);
				680
Dmitri Gribenko	f26054f	2012-07-11 21:38:39 +0000	[diff] [blame]	681	State = LS_Normal;
				682	return;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	683	}
				684	}
				685
				686	// Now look ahead and return to normal state if we don't see any HTML tokens
				687	// ahead.
				688	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
				689	if (BufferPtr == CommentEnd) {
				690	State = LS_Normal;
				691	return;
				692	}
				693
				694	C = *BufferPtr;
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	695	if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	696	C != '=' && C != '\"' && C != '\'' && C != '>') {
				697	State = LS_Normal;
				698	return;
				699	}
				700	}
				701
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	702	void Lexer::setupAndLexHTMLEndTag(Token &T) {
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	703	assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
				704
				705	const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
				706	const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
Dmitri Gribenko	107618a	2012-08-22 22:56:08 +0000	[diff] [blame]	707	StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
				708	if (!isHTMLTagName(Name)) {
				709	formTextToken(T, TagNameEnd);
				710	return;
				711	}
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	712
				713	const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	714
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	715	formTokenWithChars(T, End, tok::html_end_tag);
Dmitri Gribenko	107618a	2012-08-22 22:56:08 +0000	[diff] [blame]	716	T.setHTMLTagEndName(Name);
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	717
				718	if (BufferPtr != CommentEnd && *BufferPtr == '>')
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	719	State = LS_HTMLEndTag;
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	720	}
				721
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	722	void Lexer::lexHTMLEndTag(Token &T) {
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	723	assert(BufferPtr != CommentEnd && *BufferPtr == '>');
				724
				725	formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
				726	State = LS_Normal;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	727	}
				728
Fariborz Jahanian	5b63707	2013-05-03 23:15:20 +0000	[diff] [blame]	729	Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
				730	const CommandTraits &Traits,
Dmitri Gribenko	6bab911	2012-08-31 10:35:30 +0000	[diff] [blame]	731	SourceLocation FileLoc,
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	732	const char BufferStart, const char BufferEnd):
Fariborz Jahanian	5b63707	2013-05-03 23:15:20 +0000	[diff] [blame]	733	Allocator(Allocator), Diags(Diags), Traits(Traits),
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	734	BufferStart(BufferStart), BufferEnd(BufferEnd),
Dmitri Gribenko	6bab911	2012-08-31 10:35:30 +0000	[diff] [blame]	735	FileLoc(FileLoc), BufferPtr(BufferStart),
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	736	CommentState(LCS_BeforeComment), State(LS_Normal) {
				737	}
				738
				739	void Lexer::lex(Token &T) {
				740	again:
				741	switch (CommentState) {
				742	case LCS_BeforeComment:
				743	if (BufferPtr == BufferEnd) {
				744	formTokenWithChars(T, BufferPtr, tok::eof);
				745	return;
				746	}
				747
				748	assert(*BufferPtr == '/');
				749	BufferPtr++; // Skip first slash.
				750	switch(*BufferPtr) {
				751	case '/': { // BCPL comment.
				752	BufferPtr++; // Skip second slash.
				753
				754	if (BufferPtr != BufferEnd) {
				755	// Skip Doxygen magic marker, if it is present.
				756	// It might be missing because of a typo //< or /*<, or because we
				757	// merged this non-Doxygen comment into a bunch of Doxygen comments
				758	// around it: /** ... / / ... / /* ... */
				759	const char C = *BufferPtr;
				760	if (C == '/' \|\| C == '!')
				761	BufferPtr++;
				762	}
				763
				764	// Skip less-than symbol that marks trailing comments.
				765	// Skip it even if the comment is not a Doxygen one, because //< and /*<
				766	// are frequent typos.
				767	if (BufferPtr != BufferEnd && *BufferPtr == '<')
				768	BufferPtr++;
				769
				770	CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	771	if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
				772	State = LS_Normal;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	773	CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
				774	goto again;
				775	}
				776	case '*': { // C comment.
				777	BufferPtr++; // Skip star.
				778
				779	// Skip Doxygen magic marker.
				780	const char C = *BufferPtr;
				781	if ((C == '' && (BufferPtr + 1) != '/') \|\| C == '!')
				782	BufferPtr++;
				783
				784	// Skip less-than symbol that marks trailing comments.
				785	if (BufferPtr != BufferEnd && *BufferPtr == '<')
				786	BufferPtr++;
				787
				788	CommentState = LCS_InsideCComment;
				789	State = LS_Normal;
				790	CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
				791	goto again;
				792	}
				793	default:
				794	llvm_unreachable("second character of comment should be '/' or '*'");
				795	}
				796
				797	case LCS_BetweenComments: {
				798	// Consecutive comments are extracted only if there is only whitespace
				799	// between them. So we can search for the start of the next comment.
				800	const char *EndWhitespace = BufferPtr;
				801	while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
				802	EndWhitespace++;
				803
				804	// Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	805	// between them -- guaranteed by comment extraction) into a newline. We
				806	// have two newlines between C comments in total (first one was synthesized
				807	// after a comment).
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	808	formTokenWithChars(T, EndWhitespace, tok::newline);
				809
				810	CommentState = LCS_BeforeComment;
				811	break;
				812	}
				813
				814	case LCS_InsideBCPLComment:
				815	case LCS_InsideCComment:
				816	if (BufferPtr != CommentEnd) {
				817	lexCommentText(T);
				818	break;
				819	} else {
				820	// Skip C comment closing sequence.
				821	if (CommentState == LCS_InsideCComment) {
				822	assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
				823	BufferPtr += 2;
				824	assert(BufferPtr <= BufferEnd);
				825
				826	// Synthenize newline just after the C comment, regardless if there is
				827	// actually a newline.
				828	formTokenWithChars(T, BufferPtr, tok::newline);
				829
				830	CommentState = LCS_BetweenComments;
				831	break;
				832	} else {
				833	// Don't synthesized a newline after BCPL comment.
				834	CommentState = LCS_BetweenComments;
				835	goto again;
				836	}
				837	}
				838	}
				839	}
				840
				841	StringRef Lexer::getSpelling(const Token &Tok,
				842	const SourceManager &SourceMgr,
				843	bool *Invalid) const {
				844	SourceLocation Loc = Tok.getLocation();
				845	std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
				846
				847	bool InvalidTemp = false;
				848	StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
				849	if (InvalidTemp) {
				850	*Invalid = true;
				851	return StringRef();
				852	}
				853
				854	const char *Begin = File.data() + LocInfo.second;
				855	return StringRef(Begin, Tok.getLength());
				856	}
				857
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	858	} // end namespace comments
				859	} // end namespace clang