Blame - lib/AST/CommentLexer.cpp - platform/external/clang

blob: 31468321cf4036b23731e300438262a98d841e05 [file] [log] [blame]

Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	1	#include "clang/AST/CommentLexer.h"
				2	#include "llvm/ADT/StringSwitch.h"
				3	#include "llvm/Support/ErrorHandling.h"
				4
				5	namespace clang {
				6	namespace comments {
				7
				8	void Token::dump(const Lexer &L, const SourceManager &SM) const {
				9	llvm::errs() << "comments::Token Kind=" << Kind << " ";
				10	Loc.dump(SM);
				11	llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
				12	}
				13
				14	bool Lexer::isVerbatimBlockCommand(StringRef BeginName,
				15	StringRef &EndName) const {
				16	const char Result = llvm::StringSwitch<const char >(BeginName)
				17	.Case("code", "endcode")
				18	.Case("verbatim", "endverbatim")
				19	.Case("htmlonly", "endhtmlonly")
				20	.Case("latexonly", "endlatexonly")
				21	.Case("xmlonly", "endxmlonly")
				22	.Case("manonly", "endmanonly")
				23	.Case("rtfonly", "endrtfonly")
				24
				25	.Case("dot", "enddot")
				26	.Case("msc", "endmsc")
				27
				28	.Case("f$", "f$") // Inline LaTeX formula
				29	.Case("f[", "f]") // Displayed LaTeX formula
				30	.Case("f{", "f}") // LaTeX environment
				31
				32	.Default(NULL);
				33
				34	if (Result) {
				35	EndName = Result;
				36	return true;
				37	}
				38
				39	for (VerbatimBlockCommandVector::const_iterator
				40	I = VerbatimBlockCommands.begin(),
				41	E = VerbatimBlockCommands.end();
				42	I != E; ++I)
				43	if (I->BeginName == BeginName) {
				44	EndName = I->EndName;
				45	return true;
				46	}
				47
				48	return false;
				49	}
				50
				51	bool Lexer::isVerbatimLineCommand(StringRef Name) const {
				52	bool Result = llvm::StringSwitch<bool>(Name)
				53	.Case("fn", true)
				54	.Case("var", true)
				55	.Case("property", true)
				56	.Case("typedef", true)
				57
				58	.Case("overload", true)
				59
				60	.Case("defgroup", true)
				61	.Case("ingroup", true)
				62	.Case("addtogroup", true)
				63	.Case("weakgroup", true)
				64	.Case("name", true)
				65
				66	.Case("section", true)
				67	.Case("subsection", true)
				68	.Case("subsubsection", true)
				69	.Case("paragraph", true)
				70
				71	.Case("mainpage", true)
				72	.Case("subpage", true)
				73	.Case("ref", true)
				74
				75	.Default(false);
				76
				77	if (Result)
				78	return true;
				79
				80	for (VerbatimLineCommandVector::const_iterator
				81	I = VerbatimLineCommands.begin(),
				82	E = VerbatimLineCommands.end();
				83	I != E; ++I)
				84	if (I->Name == Name)
				85	return true;
				86
				87	return false;
				88	}
				89
				90	void Lexer::skipLineStartingDecorations() {
				91	// This function should be called only for C comments
				92	assert(CommentState == LCS_InsideCComment);
				93
				94	if (BufferPtr == CommentEnd)
				95	return;
				96
				97	switch (*BufferPtr) {
				98	case ' ':
				99	case '\t':
				100	case '\f':
				101	case '\v': {
				102	const char *NewBufferPtr = BufferPtr;
				103	NewBufferPtr++;
				104	if (NewBufferPtr == CommentEnd)
				105	return;
				106
				107	char C = *NewBufferPtr;
				108	while (C == ' ' \|\| C == '\t' \|\| C == '\f' \|\| C == '\v') {
				109	NewBufferPtr++;
				110	if (NewBufferPtr == CommentEnd)
				111	return;
				112	C = *NewBufferPtr;
				113	}
				114	if (C == '*')
				115	BufferPtr = NewBufferPtr + 1;
				116	break;
				117	}
				118	case '*':
				119	BufferPtr++;
				120	break;
				121	}
				122	}
				123
				124	namespace {
/// Returns pointer to the first newline character ('\n' or '\r') in the range
/// [BufferPtr, BufferEnd), or \p BufferEnd if the range contains none.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    if (C == '\n' || C == '\r')
      return BufferPtr;
  }
  return BufferEnd;
}
				134
				135	const char skipNewline(const char BufferPtr, const char *BufferEnd) {
				136	if (BufferPtr == BufferEnd)
				137	return BufferPtr;
				138
				139	if (*BufferPtr == '\n')
				140	BufferPtr++;
				141	else {
				142	assert(*BufferPtr == '\r');
				143	BufferPtr++;
				144	if (BufferPtr != BufferEnd && *BufferPtr == '\n')
				145	BufferPtr++;
				146	}
				147	return BufferPtr;
				148	}
				149
/// Returns true if \p C may start an HTML identifier, i.e. it is an ASCII
/// letter.
bool isHTMLIdentifierStartingCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}
				154
/// Returns true if \p C may appear inside an HTML identifier, i.e. it is an
/// ASCII letter or digit.
bool isHTMLIdentifierCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z') ||
         (C >= '0' && C <= '9');
}
				160
				161	const char skipHTMLIdentifier(const char BufferPtr, const char *BufferEnd) {
				162	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				163	if (!isHTMLIdentifierCharacter(*BufferPtr))
				164	return BufferPtr;
				165	}
				166	return BufferEnd;
				167	}
				168
				169	/// Skip HTML string quoted in single or double quotes. Escaping quotes inside
				170	/// string allowed.
				171	///
				172	/// Returns pointer to closing quote.
				173	const char skipHTMLQuotedString(const char BufferPtr, const char *BufferEnd)
				174	{
				175	const char Quote = *BufferPtr;
				176	assert(Quote == '\"' \|\| Quote == '\'');
				177
				178	BufferPtr++;
				179	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				180	const char C = *BufferPtr;
				181	if (C == Quote && BufferPtr[-1] != '\\')
				182	return BufferPtr;
				183	}
				184	return BufferEnd;
				185	}
				186
				187	bool isHorizontalWhitespace(char C) {
				188	return C == ' ' \|\| C == '\t' \|\| C == '\f' \|\| C == '\v';
				189	}
				190
				191	bool isWhitespace(char C) {
				192	return C == ' ' \|\| C == '\n' \|\| C == '\r' \|\|
				193	C == '\t' \|\| C == '\f' \|\| C == '\v';
				194	}
				195
				196	const char skipWhitespace(const char BufferPtr, const char *BufferEnd) {
				197	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				198	if (!isWhitespace(*BufferPtr))
				199	return BufferPtr;
				200	}
				201	return BufferEnd;
				202	}
				203
Dmitri Gribenko	64da4e5	2012-07-18 23:01:58 +0000	[diff] [blame]	204	bool isWhitespace(const char BufferPtr, const char BufferEnd) {
				205	return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
				206	}
				207
/// Returns true if \p C may appear in a command name, i.e. it is an ASCII
/// letter or digit.
bool isCommandNameCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z') ||
         (C >= '0' && C <= '9');
}
				213
				214	const char skipCommandName(const char BufferPtr, const char *BufferEnd) {
				215	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				216	if (!isCommandNameCharacter(*BufferPtr))
				217	return BufferPtr;
				218	}
				219	return BufferEnd;
				220	}
				221
				222	/// Return the one past end pointer for BCPL comments.
				223	/// Handles newlines escaped with backslash or trigraph for backslahs.
				224	const char findBCPLCommentEnd(const char BufferPtr, const char *BufferEnd) {
				225	const char *CurPtr = BufferPtr;
				226	while (CurPtr != BufferEnd) {
				227	char C = *CurPtr;
				228	while (C != '\n' && C != '\r') {
				229	CurPtr++;
				230	if (CurPtr == BufferEnd)
				231	return BufferEnd;
				232	C = *CurPtr;
				233	}
				234	// We found a newline, check if it is escaped.
				235	const char *EscapePtr = CurPtr - 1;
				236	while(isHorizontalWhitespace(*EscapePtr))
				237	EscapePtr--;
				238
				239	if (*EscapePtr == '\\' \|\|
				240	(EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
				241	EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
				242	// We found an escaped newline.
				243	CurPtr = skipNewline(CurPtr, BufferEnd);
				244	} else
				245	return CurPtr; // Not an escaped newline.
				246	}
				247	return BufferEnd;
				248	}
				249
				250	/// Return the one past end pointer for C comments.
				251	/// Very dumb, does not handle escaped newlines or trigraphs.
				252	const char findCCommentEnd(const char BufferPtr, const char *BufferEnd) {
				253	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				254	if (BufferPtr == '') {
				255	assert(BufferPtr + 1 != BufferEnd);
				256	if (*(BufferPtr + 1) == '/')
				257	return BufferPtr;
				258	}
				259	}
				260	llvm_unreachable("buffer end hit before '*/' was seen");
				261	}
				262	} // unnamed namespace
				263
				264	void Lexer::lexCommentText(Token &T) {
				265	assert(CommentState == LCS_InsideBCPLComment \|\|
				266	CommentState == LCS_InsideCComment);
				267
				268	switch (State) {
				269	case LS_Normal:
				270	break;
				271	case LS_VerbatimBlockFirstLine:
				272	lexVerbatimBlockFirstLine(T);
				273	return;
				274	case LS_VerbatimBlockBody:
				275	lexVerbatimBlockBody(T);
				276	return;
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	277	case LS_VerbatimLineText:
				278	lexVerbatimLineText(T);
				279	return;
Dmitri Gribenko	3f38bf2	2012-07-13 00:44:24 +0000	[diff] [blame]	280	case LS_HTMLStartTag:
				281	lexHTMLStartTag(T);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	282	return;
Dmitri Gribenko	3f38bf2	2012-07-13 00:44:24 +0000	[diff] [blame]	283	case LS_HTMLEndTag:
				284	lexHTMLEndTag(T);
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	285	return;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	286	}
				287
				288	assert(State == LS_Normal);
				289
				290	const char *TokenPtr = BufferPtr;
				291	assert(TokenPtr < CommentEnd);
				292	while (TokenPtr != CommentEnd) {
				293	switch(*TokenPtr) {
				294	case '\\':
				295	case '@': {
				296	TokenPtr++;
				297	if (TokenPtr == CommentEnd) {
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	298	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	299	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	300	T.setText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	301	return;
				302	}
				303	char C = *TokenPtr;
				304	switch (C) {
				305	default:
				306	break;
				307
				308	case '\\': case '@': case '&': case '$':
				309	case '#': case '<': case '>': case '%':
				310	case '\"': case '.': case ':':
				311	// This is one of \\ \@ \& \$ etc escape sequences.
				312	TokenPtr++;
				313	if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
				314	// This is the \:: escape sequence.
				315	TokenPtr++;
				316	}
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	317	StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	318	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	319	T.setText(UnescapedText);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	320	return;
				321	}
				322
				323	// Don't make zero-length commands.
				324	if (!isCommandNameCharacter(*TokenPtr)) {
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	325	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	326	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	327	T.setText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	328	return;
				329	}
				330
				331	TokenPtr = skipCommandName(TokenPtr, CommentEnd);
				332	unsigned Length = TokenPtr - (BufferPtr + 1);
				333
				334	// Hardcoded support for lexing LaTeX formula commands
				335	// \f$ \f[ \f] \f{ \f} as a single command.
				336	if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
				337	C = *TokenPtr;
				338	if (C == '$' \|\| C == '[' \|\| C == ']' \|\| C == '{' \|\| C == '}') {
				339	TokenPtr++;
				340	Length++;
				341	}
				342	}
				343
				344	const StringRef CommandName(BufferPtr + 1, Length);
				345	StringRef EndName;
				346
				347	if (isVerbatimBlockCommand(CommandName, EndName)) {
				348	setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
				349	return;
				350	}
				351	if (isVerbatimLineCommand(CommandName)) {
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	352	setupAndLexVerbatimLine(T, TokenPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	353	return;
				354	}
				355	formTokenWithChars(T, TokenPtr, tok::command);
				356	T.setCommandName(CommandName);
				357	return;
				358	}
				359
				360	case '<': {
				361	TokenPtr++;
				362	if (TokenPtr == CommentEnd) {
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	363	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	364	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	365	T.setText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	366	return;
				367	}
				368	const char C = *TokenPtr;
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	369	if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko	3f38bf2	2012-07-13 00:44:24 +0000	[diff] [blame]	370	setupAndLexHTMLStartTag(T);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	371	else if (C == '/')
Dmitri Gribenko	3f38bf2	2012-07-13 00:44:24 +0000	[diff] [blame]	372	setupAndLexHTMLEndTag(T);
Dmitri Gribenko	5676d32	2012-06-27 23:28:29 +0000	[diff] [blame]	373	else {
				374	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
				375	formTokenWithChars(T, TokenPtr, tok::text);
				376	T.setText(Text);
				377	}
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	378	return;
				379	}
				380
				381	case '\n':
				382	case '\r':
				383	TokenPtr = skipNewline(TokenPtr, CommentEnd);
				384	formTokenWithChars(T, TokenPtr, tok::newline);
				385
				386	if (CommentState == LCS_InsideCComment)
				387	skipLineStartingDecorations();
				388	return;
				389
				390	default: {
				391	while (true) {
				392	TokenPtr++;
				393	if (TokenPtr == CommentEnd)
				394	break;
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	395	const char C = *TokenPtr;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	396	if(C == '\n' \|\| C == '\r' \|\|
				397	C == '\\' \|\| C == '@' \|\| C == '<')
				398	break;
				399	}
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	400	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	401	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	402	T.setText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	403	return;
				404	}
				405	}
				406	}
				407	}
				408
				409	void Lexer::setupAndLexVerbatimBlock(Token &T,
				410	const char *TextBegin,
				411	char Marker, StringRef EndName) {
				412	VerbatimBlockEndCommandName.clear();
				413	VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
				414	VerbatimBlockEndCommandName.append(EndName);
				415
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	416	StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	417	formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	418	T.setVerbatimBlockName(Name);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	419
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	420	// If there is a newline following the verbatim opening command, skip the
				421	// newline so that we don't create an tok::verbatim_block_line with empty
				422	// text content.
				423	if (BufferPtr != CommentEnd) {
				424	const char C = *BufferPtr;
				425	if (C == '\n' \|\| C == '\r') {
				426	BufferPtr = skipNewline(BufferPtr, CommentEnd);
				427	State = LS_VerbatimBlockBody;
				428	return;
				429	}
				430	}
				431
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	432	State = LS_VerbatimBlockFirstLine;
				433	}
				434
				435	void Lexer::lexVerbatimBlockFirstLine(Token &T) {
Dmitri Gribenko	64da4e5	2012-07-18 23:01:58 +0000	[diff] [blame]	436	again:
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	437	assert(BufferPtr < CommentEnd);
				438
				439	// FIXME: It would be better to scan the text once, finding either the block
				440	// end command or newline.
				441	//
				442	// Extract current line.
				443	const char *Newline = findNewline(BufferPtr, CommentEnd);
				444	StringRef Line(BufferPtr, Newline - BufferPtr);
				445
				446	// Look for end command in current line.
				447	size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	448	const char *TextEnd;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	449	const char *NextLine;
				450	if (Pos == StringRef::npos) {
				451	// Current line is completely verbatim.
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	452	TextEnd = Newline;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	453	NextLine = skipNewline(Newline, CommentEnd);
				454	} else if (Pos == 0) {
				455	// Current line contains just an end command.
				456	const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	457	StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	458	formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	459	T.setVerbatimBlockName(Name);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	460	State = LS_Normal;
				461	return;
				462	} else {
				463	// There is some text, followed by end command. Extract text first.
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	464	TextEnd = BufferPtr + Pos;
				465	NextLine = TextEnd;
Dmitri Gribenko	64da4e5	2012-07-18 23:01:58 +0000	[diff] [blame]	466	// If there is only whitespace before end command, skip whitespace.
				467	if (isWhitespace(BufferPtr, TextEnd)) {
				468	BufferPtr = TextEnd;
				469	goto again;
				470	}
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	471	}
				472
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	473	StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	474	formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	475	T.setVerbatimBlockText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	476
				477	State = LS_VerbatimBlockBody;
				478	}
				479
				480	void Lexer::lexVerbatimBlockBody(Token &T) {
				481	assert(State == LS_VerbatimBlockBody);
				482
				483	if (CommentState == LCS_InsideCComment)
				484	skipLineStartingDecorations();
				485
				486	lexVerbatimBlockFirstLine(T);
				487	}
				488
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	489	void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) {
				490	const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
				491	formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
				492	T.setVerbatimLineName(Name);
				493
				494	State = LS_VerbatimLineText;
				495	}
				496
				497	void Lexer::lexVerbatimLineText(Token &T) {
				498	assert(State == LS_VerbatimLineText);
				499
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	500	// Extract current line.
				501	const char *Newline = findNewline(BufferPtr, CommentEnd);
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	502	const StringRef Text(BufferPtr, Newline - BufferPtr);
				503	formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	504	T.setVerbatimLineText(Text);
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	505
				506	State = LS_Normal;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	507	}
				508
Dmitri Gribenko	3f38bf2	2012-07-13 00:44:24 +0000	[diff] [blame]	509	void Lexer::setupAndLexHTMLStartTag(Token &T) {
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	510	assert(BufferPtr[0] == '<' &&
				511	isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	512	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
				513
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	514	StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko	3f38bf2	2012-07-13 00:44:24 +0000	[diff] [blame]	515	formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
				516	T.setHTMLTagStartName(Name);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	517
				518	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
				519
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	520	const char C = *BufferPtr;
				521	if (BufferPtr != CommentEnd &&
Dmitri Gribenko	a5ef44f	2012-07-11 21:38:39 +0000	[diff] [blame]	522	(C == '>' \|\| C == '/' \|\| isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko	3f38bf2	2012-07-13 00:44:24 +0000	[diff] [blame]	523	State = LS_HTMLStartTag;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	524	}
				525
Dmitri Gribenko	3f38bf2	2012-07-13 00:44:24 +0000	[diff] [blame]	526	void Lexer::lexHTMLStartTag(Token &T) {
				527	assert(State == LS_HTMLStartTag);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	528
				529	const char *TokenPtr = BufferPtr;
				530	char C = *TokenPtr;
				531	if (isHTMLIdentifierCharacter(C)) {
				532	TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	533	StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	534	formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	535	T.setHTMLIdent(Ident);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	536	} else {
				537	switch (C) {
				538	case '=':
				539	TokenPtr++;
				540	formTokenWithChars(T, TokenPtr, tok::html_equals);
				541	break;
				542	case '\"':
				543	case '\'': {
				544	const char *OpenQuote = TokenPtr;
				545	TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
				546	const char *ClosingQuote = TokenPtr;
				547	if (TokenPtr != CommentEnd) // Skip closing quote.
				548	TokenPtr++;
				549	formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
				550	T.setHTMLQuotedString(StringRef(OpenQuote + 1,
				551	ClosingQuote - (OpenQuote + 1)));
				552	break;
				553	}
				554	case '>':
				555	TokenPtr++;
				556	formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	557	State = LS_Normal;
				558	return;
Dmitri Gribenko	a5ef44f	2012-07-11 21:38:39 +0000	[diff] [blame]	559	case '/':
				560	TokenPtr++;
				561	if (TokenPtr != CommentEnd && *TokenPtr == '>') {
				562	TokenPtr++;
				563	formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
				564	} else {
				565	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
				566	formTokenWithChars(T, TokenPtr, tok::text);
				567	T.setText(Text);
				568	}
				569	State = LS_Normal;
				570	return;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	571	}
				572	}
				573
				574	// Now look ahead and return to normal state if we don't see any HTML tokens
				575	// ahead.
				576	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
				577	if (BufferPtr == CommentEnd) {
				578	State = LS_Normal;
				579	return;
				580	}
				581
				582	C = *BufferPtr;
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	583	if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	584	C != '=' && C != '\"' && C != '\'' && C != '>') {
				585	State = LS_Normal;
				586	return;
				587	}
				588	}
				589
Dmitri Gribenko	3f38bf2	2012-07-13 00:44:24 +0000	[diff] [blame]	590	void Lexer::setupAndLexHTMLEndTag(Token &T) {
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	591	assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
				592
				593	const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
				594	const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
				595
				596	const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	597
Dmitri Gribenko	3f38bf2	2012-07-13 00:44:24 +0000	[diff] [blame]	598	formTokenWithChars(T, End, tok::html_end_tag);
				599	T.setHTMLTagEndName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	600
				601	if (BufferPtr != CommentEnd && *BufferPtr == '>')
Dmitri Gribenko	3f38bf2	2012-07-13 00:44:24 +0000	[diff] [blame]	602	State = LS_HTMLEndTag;
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	603	}
				604
Dmitri Gribenko	3f38bf2	2012-07-13 00:44:24 +0000	[diff] [blame]	605	void Lexer::lexHTMLEndTag(Token &T) {
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	606	assert(BufferPtr != CommentEnd && *BufferPtr == '>');
				607
				608	formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
				609	State = LS_Normal;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	610	}
				611
				612	Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
				613	const char BufferStart, const char BufferEnd):
				614	BufferStart(BufferStart), BufferEnd(BufferEnd),
				615	FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
				616	CommentState(LCS_BeforeComment), State(LS_Normal) {
				617	}
				618
				619	void Lexer::lex(Token &T) {
				620	again:
				621	switch (CommentState) {
				622	case LCS_BeforeComment:
				623	if (BufferPtr == BufferEnd) {
				624	formTokenWithChars(T, BufferPtr, tok::eof);
				625	return;
				626	}
				627
				628	assert(*BufferPtr == '/');
				629	BufferPtr++; // Skip first slash.
				630	switch(*BufferPtr) {
				631	case '/': { // BCPL comment.
				632	BufferPtr++; // Skip second slash.
				633
				634	if (BufferPtr != BufferEnd) {
				635	// Skip Doxygen magic marker, if it is present.
				636	// It might be missing because of a typo //< or /*<, or because we
				637	// merged this non-Doxygen comment into a bunch of Doxygen comments
				638	// around it: /** ... / / ... / /* ... */
				639	const char C = *BufferPtr;
				640	if (C == '/' \|\| C == '!')
				641	BufferPtr++;
				642	}
				643
				644	// Skip less-than symbol that marks trailing comments.
				645	// Skip it even if the comment is not a Doxygen one, because //< and /*<
				646	// are frequent typos.
				647	if (BufferPtr != BufferEnd && *BufferPtr == '<')
				648	BufferPtr++;
				649
				650	CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	651	if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
				652	State = LS_Normal;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	653	CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
				654	goto again;
				655	}
				656	case '*': { // C comment.
				657	BufferPtr++; // Skip star.
				658
				659	// Skip Doxygen magic marker.
				660	const char C = *BufferPtr;
				661	if ((C == '' && (BufferPtr + 1) != '/') \|\| C == '!')
				662	BufferPtr++;
				663
				664	// Skip less-than symbol that marks trailing comments.
				665	if (BufferPtr != BufferEnd && *BufferPtr == '<')
				666	BufferPtr++;
				667
				668	CommentState = LCS_InsideCComment;
				669	State = LS_Normal;
				670	CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
				671	goto again;
				672	}
				673	default:
				674	llvm_unreachable("second character of comment should be '/' or '*'");
				675	}
				676
				677	case LCS_BetweenComments: {
				678	// Consecutive comments are extracted only if there is only whitespace
				679	// between them. So we can search for the start of the next comment.
				680	const char *EndWhitespace = BufferPtr;
				681	while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
				682	EndWhitespace++;
				683
				684	// Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	685	// between them -- guaranteed by comment extraction) into a newline. We
				686	// have two newlines between C comments in total (first one was synthesized
				687	// after a comment).
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	688	formTokenWithChars(T, EndWhitespace, tok::newline);
				689
				690	CommentState = LCS_BeforeComment;
				691	break;
				692	}
				693
				694	case LCS_InsideBCPLComment:
				695	case LCS_InsideCComment:
				696	if (BufferPtr != CommentEnd) {
				697	lexCommentText(T);
				698	break;
				699	} else {
				700	// Skip C comment closing sequence.
				701	if (CommentState == LCS_InsideCComment) {
				702	assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
				703	BufferPtr += 2;
				704	assert(BufferPtr <= BufferEnd);
				705
				706	// Synthenize newline just after the C comment, regardless if there is
				707	// actually a newline.
				708	formTokenWithChars(T, BufferPtr, tok::newline);
				709
				710	CommentState = LCS_BetweenComments;
				711	break;
				712	} else {
				713	// Don't synthesized a newline after BCPL comment.
				714	CommentState = LCS_BetweenComments;
				715	goto again;
				716	}
				717	}
				718	}
				719	}
				720
				721	StringRef Lexer::getSpelling(const Token &Tok,
				722	const SourceManager &SourceMgr,
				723	bool *Invalid) const {
				724	SourceLocation Loc = Tok.getLocation();
				725	std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
				726
				727	bool InvalidTemp = false;
				728	StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
				729	if (InvalidTemp) {
				730	*Invalid = true;
				731	return StringRef();
				732	}
				733
				734	const char *Begin = File.data() + LocInfo.second;
				735	return StringRef(Begin, Tok.getLength());
				736	}
				737
				738	void Lexer::addVerbatimBlockCommand(StringRef BeginName, StringRef EndName) {
				739	VerbatimBlockCommand VBC;
				740	VBC.BeginName = BeginName;
				741	VBC.EndName = EndName;
				742	VerbatimBlockCommands.push_back(VBC);
				743	}
				744
				745	void Lexer::addVerbatimLineCommand(StringRef Name) {
				746	VerbatimLineCommand VLC;
				747	VLC.Name = Name;
				748	VerbatimLineCommands.push_back(VLC);
				749	}
				750
				751	} // end namespace comments
				752	} // end namespace clang
				753