Blame - clang/lib/AST/CommentLexer.cpp - toolchain/llvm-project

blob: c1ea3eab075e272f17055c2d3aea40c970a69c39 [file] [log] [blame]

Eugene Zelenko	0a4f3f4	2016-02-10 19:11:58 +0000	[diff] [blame]	1	//===--- CommentLexer.cpp -------------------------------------------------===//
				2	//
Chandler Carruth	2946cd7	2019-01-19 08:50:56 +0000	[diff] [blame]	3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	// See https://llvm.org/LICENSE.txt for license information.
				5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Eugene Zelenko	0a4f3f4	2016-02-10 19:11:58 +0000	[diff] [blame]	6	//
				7	//===----------------------------------------------------------------------===//
				8
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	9	#include "clang/AST/CommentLexer.h"
Dmitri Gribenko	ca7f80a	2012-08-09 00:03:17 +0000	[diff] [blame]	10	#include "clang/AST/CommentCommandTraits.h"
Fariborz Jahanian	6738e43	2013-05-04 00:47:28 +0000	[diff] [blame]	11	#include "clang/AST/CommentDiagnostic.h"
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	12	#include "clang/Basic/CharInfo.h"
Dmitri Gribenko	b2e5482	2013-01-19 22:06:05 +0000	[diff] [blame]	13	#include "llvm/ADT/StringExtras.h"
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	14	#include "llvm/ADT/StringSwitch.h"
Dmitri Gribenko	9feeef4	2013-01-30 12:06:08 +0000	[diff] [blame]	15	#include "llvm/Support/ConvertUTF.h"
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	16	#include "llvm/Support/ErrorHandling.h"
				17
				18	namespace clang {
				19	namespace comments {
				20
				21	void Token::dump(const Lexer &L, const SourceManager &SM) const {
				22	llvm::errs() << "comments::Token Kind=" << Kind << " ";
Stephen Kelly	3124ce7	2018-08-15 20:32:06 +0000	[diff] [blame]	23	Loc.print(llvm::errs(), SM);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	24	llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
				25	}
				26
/// Returns true if \p C may appear in the name of an HTML named character
/// reference (the part between '&' and ';').  Delegates to isLetter().
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  return isLetter(C);
}
				30
/// Returns true if \p C may appear in the digits of a decimal HTML character
/// reference (the part after "&#").  Delegates to isDigit().
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return isDigit(C);
}
				34
/// Returns true if \p C may appear in the digits of a hexadecimal HTML
/// character reference (the part after "&#x").  Delegates to isHexDigit().
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  return isHexDigit(C);
}
Dmitri Gribenko	107618a	2012-08-22 22:56:08 +0000	[diff] [blame]	38
Dmitri Gribenko	74f4d02	2013-02-10 11:54:22 +0000	[diff] [blame]	39	static inline StringRef convertCodePointToUTF8(
				40	llvm::BumpPtrAllocator &Allocator,
				41	unsigned CodePoint) {
Fariborz Jahanian	7b3ae19	2013-01-29 23:42:26 +0000	[diff] [blame]	42	char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
				43	char *ResolvedPtr = Resolved;
Dmitri Gribenko	9feeef4	2013-01-30 12:06:08 +0000	[diff] [blame]	44	if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
Fariborz Jahanian	7b3ae19	2013-01-29 23:42:26 +0000	[diff] [blame]	45	return StringRef(Resolved, ResolvedPtr - Resolved);
				46	else
				47	return StringRef();
				48	}
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	49
Dmitri Gribenko	74f4d02	2013-02-10 11:54:22 +0000	[diff] [blame]	50	namespace {
				51
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	52	#include "clang/AST/CommentHTMLTags.inc"
				53	#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
				54
Eugene Zelenko	0a4f3f4	2016-02-10 19:11:58 +0000	[diff] [blame]	55	} // end anonymous namespace
Fariborz Jahanian	7b3ae19	2013-01-29 23:42:26 +0000	[diff] [blame]	56
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	57	StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	58	// Fast path, first check a few most widely used named character references.
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	59	return llvm::StringSwitch<StringRef>(Name)
				60	.Case("amp", "&")
				61	.Case("lt", "<")
				62	.Case("gt", ">")
				63	.Case("quot", "\"")
				64	.Case("apos", "\'")
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	65	// Slow path.
				66	.Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
Fariborz Jahanian	7b3ae19	2013-01-29 23:42:26 +0000	[diff] [blame]	67	}
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	68
				69	StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
				70	unsigned CodePoint = 0;
				71	for (unsigned i = 0, e = Name.size(); i != e; ++i) {
				72	assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
				73	CodePoint *= 10;
				74	CodePoint += Name[i] - '0';
				75	}
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	76	return convertCodePointToUTF8(Allocator, CodePoint);
				77	}
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	78
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	79	StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
				80	unsigned CodePoint = 0;
				81	for (unsigned i = 0, e = Name.size(); i != e; ++i) {
				82	CodePoint *= 16;
				83	const char C = Name[i];
				84	assert(isHTMLHexCharacterReferenceCharacter(C));
				85	CodePoint += llvm::hexDigitValue(C);
				86	}
				87	return convertCodePointToUTF8(Allocator, CodePoint);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	88	}
				89
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	90	void Lexer::skipLineStartingDecorations() {
				91	// This function should be called only for C comments
				92	assert(CommentState == LCS_InsideCComment);
				93
				94	if (BufferPtr == CommentEnd)
				95	return;
				96
				97	switch (*BufferPtr) {
				98	case ' ':
				99	case '\t':
				100	case '\f':
				101	case '\v': {
				102	const char *NewBufferPtr = BufferPtr;
				103	NewBufferPtr++;
				104	if (NewBufferPtr == CommentEnd)
				105	return;
				106
				107	char C = *NewBufferPtr;
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	108	while (isHorizontalWhitespace(C)) {
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	109	NewBufferPtr++;
				110	if (NewBufferPtr == CommentEnd)
				111	return;
				112	C = *NewBufferPtr;
				113	}
				114	if (C == '*')
				115	BufferPtr = NewBufferPtr + 1;
				116	break;
				117	}
				118	case '*':
				119	BufferPtr++;
				120	break;
				121	}
				122	}
				123
				124	namespace {
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	125	/// Returns pointer to the first newline character in the string.
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	126	const char findNewline(const char BufferPtr, const char *BufferEnd) {
				127	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	128	if (isVerticalWhitespace(*BufferPtr))
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	129	return BufferPtr;
				130	}
				131	return BufferEnd;
				132	}
				133
				134	const char skipNewline(const char BufferPtr, const char *BufferEnd) {
				135	if (BufferPtr == BufferEnd)
				136	return BufferPtr;
				137
				138	if (*BufferPtr == '\n')
				139	BufferPtr++;
				140	else {
				141	assert(*BufferPtr == '\r');
				142	BufferPtr++;
				143	if (BufferPtr != BufferEnd && *BufferPtr == '\n')
				144	BufferPtr++;
				145	}
				146	return BufferPtr;
				147	}
				148
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	149	const char skipNamedCharacterReference(const char BufferPtr,
				150	const char *BufferEnd) {
				151	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				152	if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
				153	return BufferPtr;
				154	}
				155	return BufferEnd;
				156	}
				157
				158	const char skipDecimalCharacterReference(const char BufferPtr,
				159	const char *BufferEnd) {
				160	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				161	if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
				162	return BufferPtr;
				163	}
				164	return BufferEnd;
				165	}
				166
				167	const char skipHexCharacterReference(const char BufferPtr,
Dmitri Gribenko	11f54e8	2013-08-23 17:48:41 +0000	[diff] [blame]	168	const char *BufferEnd) {
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	169	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				170	if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
				171	return BufferPtr;
				172	}
				173	return BufferEnd;
				174	}
				175
/// Returns true if \p C may start an HTML identifier (tag or attribute name).
/// Delegates to isLetter().
bool isHTMLIdentifierStartingCharacter(char C) {
  return isLetter(C);
}
				179
/// Returns true if \p C may appear inside an HTML identifier.  Delegates to
/// isAlphanumeric().
bool isHTMLIdentifierCharacter(char C) {
  return isAlphanumeric(C);
}
				183
				184	const char skipHTMLIdentifier(const char BufferPtr, const char *BufferEnd) {
				185	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				186	if (!isHTMLIdentifierCharacter(*BufferPtr))
				187	return BufferPtr;
				188	}
				189	return BufferEnd;
				190	}
				191
				192	/// Skip HTML string quoted in single or double quotes. Escaping quotes inside
				193	/// string allowed.
				194	///
				195	/// Returns pointer to closing quote.
				196	const char skipHTMLQuotedString(const char BufferPtr, const char *BufferEnd)
				197	{
				198	const char Quote = *BufferPtr;
				199	assert(Quote == '\"' \|\| Quote == '\'');
				200
				201	BufferPtr++;
				202	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				203	const char C = *BufferPtr;
				204	if (C == Quote && BufferPtr[-1] != '\\')
				205	return BufferPtr;
				206	}
				207	return BufferEnd;
				208	}
				209
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	210	const char skipWhitespace(const char BufferPtr, const char *BufferEnd) {
				211	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				212	if (!isWhitespace(*BufferPtr))
				213	return BufferPtr;
				214	}
				215	return BufferEnd;
				216	}
				217
Dmitri Gribenko	e4a3997	2012-07-18 23:01:58 +0000	[diff] [blame]	218	bool isWhitespace(const char BufferPtr, const char BufferEnd) {
				219	return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
				220	}
				221
/// Returns true if \p C may be the first character of a command name.
/// Delegates to isLetter().
bool isCommandNameStartCharacter(char C) {
  return isLetter(C);
}
				225
/// Returns true if \p C may appear inside a command name.  Delegates to
/// isAlphanumeric().
bool isCommandNameCharacter(char C) {
  return isAlphanumeric(C);
}
				229
				230	const char skipCommandName(const char BufferPtr, const char *BufferEnd) {
				231	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				232	if (!isCommandNameCharacter(*BufferPtr))
				233	return BufferPtr;
				234	}
				235	return BufferEnd;
				236	}
				237
				238	/// Return the one past end pointer for BCPL comments.
				239	/// Handles newlines escaped with backslash or trigraph for backslahs.
				240	const char findBCPLCommentEnd(const char BufferPtr, const char *BufferEnd) {
				241	const char *CurPtr = BufferPtr;
				242	while (CurPtr != BufferEnd) {
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	243	while (!isVerticalWhitespace(*CurPtr)) {
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	244	CurPtr++;
				245	if (CurPtr == BufferEnd)
				246	return BufferEnd;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	247	}
				248	// We found a newline, check if it is escaped.
				249	const char *EscapePtr = CurPtr - 1;
				250	while(isHorizontalWhitespace(*EscapePtr))
				251	EscapePtr--;
				252
				253	if (*EscapePtr == '\\' \|\|
				254	(EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
				255	EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
				256	// We found an escaped newline.
				257	CurPtr = skipNewline(CurPtr, BufferEnd);
				258	} else
				259	return CurPtr; // Not an escaped newline.
				260	}
				261	return BufferEnd;
				262	}
				263
				264	/// Return the one past end pointer for C comments.
				265	/// Very dumb, does not handle escaped newlines or trigraphs.
				266	const char findCCommentEnd(const char BufferPtr, const char *BufferEnd) {
				267	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				268	if (BufferPtr == '') {
				269	assert(BufferPtr + 1 != BufferEnd);
				270	if (*(BufferPtr + 1) == '/')
				271	return BufferPtr;
				272	}
				273	}
				274	llvm_unreachable("buffer end hit before '*/' was seen");
				275	}
Fangrui Song	6907ce2	2018-07-30 19:24:48 +0000	[diff] [blame]	276
Eugene Zelenko	0a4f3f4	2016-02-10 19:11:58 +0000	[diff] [blame]	277	} // end anonymous namespace
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	278
Alp Toker	3ffab05	2013-12-07 13:51:26 +0000	[diff] [blame]	279	void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
				280	tok::TokenKind Kind) {
				281	const unsigned TokLen = TokEnd - BufferPtr;
				282	Result.setLocation(getSourceLocation(BufferPtr));
				283	Result.setKind(Kind);
				284	Result.setLength(TokLen);
				285	#ifndef NDEBUG
				286	Result.TextPtr = "<UNSET>";
				287	Result.IntVal = 7;
				288	#endif
				289	BufferPtr = TokEnd;
				290	}
				291
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	292	void Lexer::lexCommentText(Token &T) {
				293	assert(CommentState == LCS_InsideBCPLComment \|\|
				294	CommentState == LCS_InsideCComment);
				295
Ilya Biryukov	1ff7c32	2018-05-16 12:30:09 +0000	[diff] [blame]	296	// Handles lexing non-command text, i.e. text and newline.
				297	auto HandleNonCommandToken = [&]() -> void {
				298	assert(State == LS_Normal);
				299
				300	const char *TokenPtr = BufferPtr;
				301	assert(TokenPtr < CommentEnd);
				302	switch (*TokenPtr) {
				303	case '\n':
				304	case '\r':
				305	TokenPtr = skipNewline(TokenPtr, CommentEnd);
				306	formTokenWithChars(T, TokenPtr, tok::newline);
				307
				308	if (CommentState == LCS_InsideCComment)
				309	skipLineStartingDecorations();
				310	return;
				311
				312	default: {
				313	StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
				314	size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
				315	.find_first_of(TokStartSymbols);
				316	if (End != StringRef::npos)
				317	TokenPtr += End;
				318	else
				319	TokenPtr = CommentEnd;
				320	formTextToken(T, TokenPtr);
				321	return;
				322	}
				323	}
				324	};
				325
				326	if (!ParseCommands)
				327	return HandleNonCommandToken();
				328
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	329	switch (State) {
				330	case LS_Normal:
				331	break;
				332	case LS_VerbatimBlockFirstLine:
				333	lexVerbatimBlockFirstLine(T);
				334	return;
				335	case LS_VerbatimBlockBody:
				336	lexVerbatimBlockBody(T);
				337	return;
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	338	case LS_VerbatimLineText:
				339	lexVerbatimLineText(T);
				340	return;
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	341	case LS_HTMLStartTag:
				342	lexHTMLStartTag(T);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	343	return;
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	344	case LS_HTMLEndTag:
				345	lexHTMLEndTag(T);
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	346	return;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	347	}
				348
				349	assert(State == LS_Normal);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	350	const char *TokenPtr = BufferPtr;
				351	assert(TokenPtr < CommentEnd);
Ilya Biryukov	1ff7c32	2018-05-16 12:30:09 +0000	[diff] [blame]	352	switch(*TokenPtr) {
				353	case '\\':
				354	case '@': {
				355	// Commands that start with a backslash and commands that start with
				356	// 'at' have equivalent semantics. But we keep information about the
				357	// exact syntax in AST for comments.
				358	tok::TokenKind CommandKind =
				359	(*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
				360	TokenPtr++;
				361	if (TokenPtr == CommentEnd) {
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	362	formTextToken(T, TokenPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	363	return;
				364	}
Ilya Biryukov	1ff7c32	2018-05-16 12:30:09 +0000	[diff] [blame]	365	char C = *TokenPtr;
				366	switch (C) {
				367	default:
				368	break;
				369
				370	case '\\': case '@': case '&': case '$':
				371	case '#': case '<': case '>': case '%':
				372	case '\"': case '.': case ':':
				373	// This is one of \\ \@ \& \$ etc escape sequences.
				374	TokenPtr++;
				375	if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
				376	// This is the \:: escape sequence.
				377	TokenPtr++;
				378	}
				379	StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
				380	formTokenWithChars(T, TokenPtr, tok::text);
				381	T.setText(UnescapedText);
				382	return;
				383	}
				384
				385	// Don't make zero-length commands.
				386	if (!isCommandNameStartCharacter(*TokenPtr)) {
				387	formTextToken(T, TokenPtr);
				388	return;
				389	}
				390
				391	TokenPtr = skipCommandName(TokenPtr, CommentEnd);
				392	unsigned Length = TokenPtr - (BufferPtr + 1);
				393
				394	// Hardcoded support for lexing LaTeX formula commands
				395	// \f$ \f[ \f] \f{ \f} as a single command.
				396	if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
				397	C = *TokenPtr;
				398	if (C == '$' \|\| C == '[' \|\| C == ']' \|\| C == '{' \|\| C == '}') {
				399	TokenPtr++;
				400	Length++;
				401	}
				402	}
				403
				404	StringRef CommandName(BufferPtr + 1, Length);
				405
				406	const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
				407	if (!Info) {
				408	if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
				409	StringRef CorrectedName = Info->Name;
				410	SourceLocation Loc = getSourceLocation(BufferPtr);
				411	SourceLocation EndLoc = getSourceLocation(TokenPtr);
				412	SourceRange FullRange = SourceRange(Loc, EndLoc);
				413	SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
				414	Diag(Loc, diag::warn_correct_comment_command_name)
				415	<< FullRange << CommandName << CorrectedName
				416	<< FixItHint::CreateReplacement(CommandRange, CorrectedName);
				417	} else {
				418	formTokenWithChars(T, TokenPtr, tok::unknown_command);
				419	T.setUnknownCommandName(CommandName);
				420	Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
				421	<< SourceRange(T.getLocation(), T.getEndLocation());
				422	return;
				423	}
				424	}
				425	if (Info->IsVerbatimBlockCommand) {
				426	setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
				427	return;
				428	}
				429	if (Info->IsVerbatimLineCommand) {
				430	setupAndLexVerbatimLine(T, TokenPtr, Info);
				431	return;
				432	}
				433	formTokenWithChars(T, TokenPtr, CommandKind);
				434	T.setCommandID(Info->getID());
				435	return;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	436	}
Ilya Biryukov	1ff7c32	2018-05-16 12:30:09 +0000	[diff] [blame]	437
				438	case '&':
				439	lexHTMLCharacterReference(T);
				440	return;
				441
				442	case '<': {
				443	TokenPtr++;
				444	if (TokenPtr == CommentEnd) {
				445	formTextToken(T, TokenPtr);
				446	return;
				447	}
				448	const char C = *TokenPtr;
				449	if (isHTMLIdentifierStartingCharacter(C))
				450	setupAndLexHTMLStartTag(T);
				451	else if (C == '/')
				452	setupAndLexHTMLEndTag(T);
				453	else
				454	formTextToken(T, TokenPtr);
				455	return;
				456	}
				457
				458	default:
				459	return HandleNonCommandToken();
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	460	}
				461	}
				462
				463	void Lexer::setupAndLexVerbatimBlock(Token &T,
				464	const char *TextBegin,
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	465	char Marker, const CommandInfo *Info) {
				466	assert(Info->IsVerbatimBlockCommand);
				467
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	468	VerbatimBlockEndCommandName.clear();
				469	VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	470	VerbatimBlockEndCommandName.append(Info->EndCommandName);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	471
				472	formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	473	T.setVerbatimBlockID(Info->getID());
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	474
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	475	// If there is a newline following the verbatim opening command, skip the
				476	// newline so that we don't create an tok::verbatim_block_line with empty
				477	// text content.
Dmitri Gribenko	bcef341	2013-02-09 15:16:58 +0000	[diff] [blame]	478	if (BufferPtr != CommentEnd &&
				479	isVerticalWhitespace(*BufferPtr)) {
				480	BufferPtr = skipNewline(BufferPtr, CommentEnd);
				481	State = LS_VerbatimBlockBody;
				482	return;
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	483	}
				484
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	485	State = LS_VerbatimBlockFirstLine;
				486	}
				487
				488	void Lexer::lexVerbatimBlockFirstLine(Token &T) {
Dmitri Gribenko	e4a3997	2012-07-18 23:01:58 +0000	[diff] [blame]	489	again:
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	490	assert(BufferPtr < CommentEnd);
				491
				492	// FIXME: It would be better to scan the text once, finding either the block
				493	// end command or newline.
				494	//
				495	// Extract current line.
				496	const char *Newline = findNewline(BufferPtr, CommentEnd);
				497	StringRef Line(BufferPtr, Newline - BufferPtr);
				498
				499	// Look for end command in current line.
				500	size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	501	const char *TextEnd;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	502	const char *NextLine;
				503	if (Pos == StringRef::npos) {
				504	// Current line is completely verbatim.
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	505	TextEnd = Newline;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	506	NextLine = skipNewline(Newline, CommentEnd);
				507	} else if (Pos == 0) {
				508	// Current line contains just an end command.
				509	const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	510	StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	511	formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenko	7acbf00	2012-09-10 20:32:42 +0000	[diff] [blame]	512	T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	513	State = LS_Normal;
				514	return;
				515	} else {
				516	// There is some text, followed by end command. Extract text first.
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	517	TextEnd = BufferPtr + Pos;
				518	NextLine = TextEnd;
Dmitri Gribenko	e4a3997	2012-07-18 23:01:58 +0000	[diff] [blame]	519	// If there is only whitespace before end command, skip whitespace.
				520	if (isWhitespace(BufferPtr, TextEnd)) {
				521	BufferPtr = TextEnd;
				522	goto again;
				523	}
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	524	}
				525
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	526	StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	527	formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	528	T.setVerbatimBlockText(Text);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	529
				530	State = LS_VerbatimBlockBody;
				531	}
				532
				533	void Lexer::lexVerbatimBlockBody(Token &T) {
				534	assert(State == LS_VerbatimBlockBody);
				535
				536	if (CommentState == LCS_InsideCComment)
				537	skipLineStartingDecorations();
				538
Dmitri Gribenko	8b72062	2015-04-15 23:45:43 +0000	[diff] [blame]	539	if (BufferPtr == CommentEnd) {
				540	formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
				541	T.setVerbatimBlockText("");
				542	return;
				543	}
				544
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	545	lexVerbatimBlockFirstLine(T);
				546	}
				547
/// Emits the verbatim_line_name token for the verbatim line command described
/// by \p Info and switches the lexer to LS_VerbatimLineText so that the rest
/// of the line is lexed as verbatim text.
///
/// \param T          token to fill.
/// \param TextBegin  end of the command name; the token extends up to here.
/// \param Info       command description; must be a verbatim line command.
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);
  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());

  State = LS_VerbatimLineText;
}
				556
				557	void Lexer::lexVerbatimLineText(Token &T) {
				558	assert(State == LS_VerbatimLineText);
				559
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	560	// Extract current line.
				561	const char *Newline = findNewline(BufferPtr, CommentEnd);
Craig Topper	bf3e327	2014-08-30 16:55:52 +0000	[diff] [blame]	562	StringRef Text(BufferPtr, Newline - BufferPtr);
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	563	formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	564	T.setVerbatimLineText(Text);
Dmitri Gribenko	1669f70	2012-06-27 16:53:58 +0000	[diff] [blame]	565
				566	State = LS_Normal;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	567	}
				568
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	569	void Lexer::lexHTMLCharacterReference(Token &T) {
				570	const char *TokenPtr = BufferPtr;
				571	assert(*TokenPtr == '&');
				572	TokenPtr++;
				573	if (TokenPtr == CommentEnd) {
				574	formTextToken(T, TokenPtr);
				575	return;
				576	}
				577	const char *NamePtr;
				578	bool isNamed = false;
				579	bool isDecimal = false;
				580	char C = *TokenPtr;
				581	if (isHTMLNamedCharacterReferenceCharacter(C)) {
				582	NamePtr = TokenPtr;
				583	TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
				584	isNamed = true;
				585	} else if (C == '#') {
				586	TokenPtr++;
				587	if (TokenPtr == CommentEnd) {
				588	formTextToken(T, TokenPtr);
				589	return;
				590	}
				591	C = *TokenPtr;
				592	if (isHTMLDecimalCharacterReferenceCharacter(C)) {
				593	NamePtr = TokenPtr;
				594	TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
				595	isDecimal = true;
				596	} else if (C == 'x' \|\| C == 'X') {
				597	TokenPtr++;
				598	NamePtr = TokenPtr;
				599	TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
				600	} else {
				601	formTextToken(T, TokenPtr);
				602	return;
				603	}
				604	} else {
				605	formTextToken(T, TokenPtr);
				606	return;
				607	}
				608	if (NamePtr == TokenPtr \|\| TokenPtr == CommentEnd \|\|
				609	*TokenPtr != ';') {
				610	formTextToken(T, TokenPtr);
				611	return;
				612	}
				613	StringRef Name(NamePtr, TokenPtr - NamePtr);
				614	TokenPtr++; // Skip semicolon.
				615	StringRef Resolved;
Dmitri Gribenko	28800da	2013-01-30 14:29:28 +0000	[diff] [blame]	616	if (isNamed)
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	617	Resolved = resolveHTMLNamedCharacterReference(Name);
				618	else if (isDecimal)
				619	Resolved = resolveHTMLDecimalCharacterReference(Name);
				620	else
				621	Resolved = resolveHTMLHexCharacterReference(Name);
				622
				623	if (Resolved.empty()) {
				624	formTextToken(T, TokenPtr);
				625	return;
				626	}
				627	formTokenWithChars(T, TokenPtr, tok::text);
				628	T.setText(Resolved);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	629	}
				630
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	631	void Lexer::setupAndLexHTMLStartTag(Token &T) {
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	632	assert(BufferPtr[0] == '<' &&
				633	isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	634	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	635	StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko	107618a	2012-08-22 22:56:08 +0000	[diff] [blame]	636	if (!isHTMLTagName(Name)) {
				637	formTextToken(T, TagNameEnd);
				638	return;
				639	}
				640
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	641	formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
				642	T.setHTMLTagStartName(Name);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	643
				644	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
				645
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	646	const char C = *BufferPtr;
				647	if (BufferPtr != CommentEnd &&
Dmitri Gribenko	f26054f	2012-07-11 21:38:39 +0000	[diff] [blame]	648	(C == '>' \|\| C == '/' \|\| isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	649	State = LS_HTMLStartTag;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	650	}
				651
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	652	void Lexer::lexHTMLStartTag(Token &T) {
				653	assert(State == LS_HTMLStartTag);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	654
				655	const char *TokenPtr = BufferPtr;
				656	char C = *TokenPtr;
				657	if (isHTMLIdentifierCharacter(C)) {
				658	TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	659	StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	660	formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenko	60ddd8a	2012-06-27 16:30:35 +0000	[diff] [blame]	661	T.setHTMLIdent(Ident);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	662	} else {
				663	switch (C) {
				664	case '=':
				665	TokenPtr++;
				666	formTokenWithChars(T, TokenPtr, tok::html_equals);
				667	break;
				668	case '\"':
				669	case '\'': {
				670	const char *OpenQuote = TokenPtr;
				671	TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
				672	const char *ClosingQuote = TokenPtr;
				673	if (TokenPtr != CommentEnd) // Skip closing quote.
				674	TokenPtr++;
				675	formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
				676	T.setHTMLQuotedString(StringRef(OpenQuote + 1,
				677	ClosingQuote - (OpenQuote + 1)));
				678	break;
				679	}
				680	case '>':
				681	TokenPtr++;
				682	formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	683	State = LS_Normal;
				684	return;
Dmitri Gribenko	f26054f	2012-07-11 21:38:39 +0000	[diff] [blame]	685	case '/':
				686	TokenPtr++;
				687	if (TokenPtr != CommentEnd && *TokenPtr == '>') {
				688	TokenPtr++;
				689	formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
Dmitri Gribenko	4586df7	2012-07-27 20:37:06 +0000	[diff] [blame]	690	} else
				691	formTextToken(T, TokenPtr);
				692
Dmitri Gribenko	f26054f	2012-07-11 21:38:39 +0000	[diff] [blame]	693	State = LS_Normal;
				694	return;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	695	}
				696	}
				697
				698	// Now look ahead and return to normal state if we don't see any HTML tokens
				699	// ahead.
				700	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
				701	if (BufferPtr == CommentEnd) {
				702	State = LS_Normal;
				703	return;
				704	}
				705
				706	C = *BufferPtr;
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	707	if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	708	C != '=' && C != '\"' && C != '\'' && C != '>') {
				709	State = LS_Normal;
				710	return;
				711	}
				712	}
				713
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	714	void Lexer::setupAndLexHTMLEndTag(Token &T) {
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	715	assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
				716
				717	const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
				718	const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
Dmitri Gribenko	107618a	2012-08-22 22:56:08 +0000	[diff] [blame]	719	StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
				720	if (!isHTMLTagName(Name)) {
				721	formTextToken(T, TagNameEnd);
				722	return;
				723	}
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	724
				725	const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	726
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	727	formTokenWithChars(T, End, tok::html_end_tag);
Dmitri Gribenko	107618a	2012-08-22 22:56:08 +0000	[diff] [blame]	728	T.setHTMLTagEndName(Name);
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	729
				730	if (BufferPtr != CommentEnd && *BufferPtr == '>')
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	731	State = LS_HTMLEndTag;
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	732	}
				733
Dmitri Gribenko	e00ffc7	2012-07-13 00:44:24 +0000	[diff] [blame]	734	void Lexer::lexHTMLEndTag(Token &T) {
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	735	assert(BufferPtr != CommentEnd && *BufferPtr == '>');
				736
				737	formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
				738	State = LS_Normal;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	739	}
				740
Fariborz Jahanian	5b63707	2013-05-03 23:15:20 +0000	[diff] [blame]	741	Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
Ilya Biryukov	1ff7c32	2018-05-16 12:30:09 +0000	[diff] [blame]	742	const CommandTraits &Traits, SourceLocation FileLoc,
				743	const char BufferStart, const char BufferEnd,
				744	bool ParseCommands)
				745	: Allocator(Allocator), Diags(Diags), Traits(Traits),
				746	BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
				747	BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
				748	ParseCommands(ParseCommands) {}
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	749
				750	void Lexer::lex(Token &T) {
				751	again:
				752	switch (CommentState) {
				753	case LCS_BeforeComment:
				754	if (BufferPtr == BufferEnd) {
				755	formTokenWithChars(T, BufferPtr, tok::eof);
				756	return;
				757	}
				758
				759	assert(*BufferPtr == '/');
				760	BufferPtr++; // Skip first slash.
				761	switch(*BufferPtr) {
				762	case '/': { // BCPL comment.
				763	BufferPtr++; // Skip second slash.
				764
				765	if (BufferPtr != BufferEnd) {
				766	// Skip Doxygen magic marker, if it is present.
				767	// It might be missing because of a typo //< or /*<, or because we
				768	// merged this non-Doxygen comment into a bunch of Doxygen comments
				769	// around it: /** ... / / ... / /* ... */
				770	const char C = *BufferPtr;
				771	if (C == '/' \|\| C == '!')
				772	BufferPtr++;
				773	}
				774
				775	// Skip less-than symbol that marks trailing comments.
				776	// Skip it even if the comment is not a Doxygen one, because //< and /*<
				777	// are frequent typos.
				778	if (BufferPtr != BufferEnd && *BufferPtr == '<')
				779	BufferPtr++;
				780
				781	CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko	ec92531	2012-07-06 00:28:32 +0000	[diff] [blame]	782	if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
				783	State = LS_Normal;
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	784	CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
				785	goto again;
				786	}
				787	case '*': { // C comment.
				788	BufferPtr++; // Skip star.
				789
				790	// Skip Doxygen magic marker.
				791	const char C = *BufferPtr;
				792	if ((C == '' && (BufferPtr + 1) != '/') \|\| C == '!')
				793	BufferPtr++;
				794
				795	// Skip less-than symbol that marks trailing comments.
				796	if (BufferPtr != BufferEnd && *BufferPtr == '<')
				797	BufferPtr++;
				798
				799	CommentState = LCS_InsideCComment;
				800	State = LS_Normal;
				801	CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
				802	goto again;
				803	}
				804	default:
				805	llvm_unreachable("second character of comment should be '/' or '*'");
				806	}
				807
				808	case LCS_BetweenComments: {
				809	// Consecutive comments are extracted only if there is only whitespace
				810	// between them. So we can search for the start of the next comment.
				811	const char *EndWhitespace = BufferPtr;
				812	while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
				813	EndWhitespace++;
				814
				815	// Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenko	17709ae	2012-07-09 21:32:40 +0000	[diff] [blame]	816	// between them -- guaranteed by comment extraction) into a newline. We
				817	// have two newlines between C comments in total (first one was synthesized
				818	// after a comment).
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	819	formTokenWithChars(T, EndWhitespace, tok::newline);
				820
				821	CommentState = LCS_BeforeComment;
				822	break;
				823	}
				824
				825	case LCS_InsideBCPLComment:
				826	case LCS_InsideCComment:
				827	if (BufferPtr != CommentEnd) {
				828	lexCommentText(T);
				829	break;
				830	} else {
				831	// Skip C comment closing sequence.
				832	if (CommentState == LCS_InsideCComment) {
				833	assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
				834	BufferPtr += 2;
				835	assert(BufferPtr <= BufferEnd);
				836
				837	// Synthenize newline just after the C comment, regardless if there is
				838	// actually a newline.
				839	formTokenWithChars(T, BufferPtr, tok::newline);
				840
				841	CommentState = LCS_BetweenComments;
				842	break;
				843	} else {
				844	// Don't synthesized a newline after BCPL comment.
				845	CommentState = LCS_BetweenComments;
				846	goto again;
				847	}
				848	}
				849	}
				850	}
				851
				852	StringRef Lexer::getSpelling(const Token &Tok,
Simon Pilgrim	4b8b7f2	2019-09-18 12:11:16 +0000	[diff] [blame]	853	const SourceManager &SourceMgr) const {
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	854	SourceLocation Loc = Tok.getLocation();
				855	std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
				856
				857	bool InvalidTemp = false;
				858	StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
Simon Pilgrim	4b8b7f2	2019-09-18 12:11:16 +0000	[diff] [blame]	859	if (InvalidTemp)
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	860	return StringRef();
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	861
				862	const char *Begin = File.data() + LocInfo.second;
				863	return StringRef(Begin, Tok.getLength());
				864	}
				865
Dmitri Gribenko	5188c4b	2012-06-26 20:39:18 +0000	[diff] [blame]	866	} // end namespace comments
				867	} // end namespace clang