Blame - lib/AST/CommentLexer.cpp - platform/external/clang

blob: 77d2a9b72dd9a4104ddc0a4304d184618aba4b24 [file] [log] [blame]

Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	1	#include "clang/AST/CommentLexer.h"
				2	#include "llvm/ADT/StringSwitch.h"
				3	#include "llvm/Support/ErrorHandling.h"
				4
				5	namespace clang {
				6	namespace comments {
				7
				8	void Token::dump(const Lexer &L, const SourceManager &SM) const {
				9	llvm::errs() << "comments::Token Kind=" << Kind << " ";
				10	Loc.dump(SM);
				11	llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
				12	}
				13
				14	bool Lexer::isVerbatimBlockCommand(StringRef BeginName,
				15	StringRef &EndName) const {
				16	const char Result = llvm::StringSwitch<const char >(BeginName)
				17	.Case("code", "endcode")
				18	.Case("verbatim", "endverbatim")
				19	.Case("htmlonly", "endhtmlonly")
				20	.Case("latexonly", "endlatexonly")
				21	.Case("xmlonly", "endxmlonly")
				22	.Case("manonly", "endmanonly")
				23	.Case("rtfonly", "endrtfonly")
				24
				25	.Case("dot", "enddot")
				26	.Case("msc", "endmsc")
				27
				28	.Case("f$", "f$") // Inline LaTeX formula
				29	.Case("f[", "f]") // Displayed LaTeX formula
				30	.Case("f{", "f}") // LaTeX environment
				31
				32	.Default(NULL);
				33
				34	if (Result) {
				35	EndName = Result;
				36	return true;
				37	}
				38
				39	for (VerbatimBlockCommandVector::const_iterator
				40	I = VerbatimBlockCommands.begin(),
				41	E = VerbatimBlockCommands.end();
				42	I != E; ++I)
				43	if (I->BeginName == BeginName) {
				44	EndName = I->EndName;
				45	return true;
				46	}
				47
				48	return false;
				49	}
				50
				51	bool Lexer::isVerbatimLineCommand(StringRef Name) const {
				52	bool Result = llvm::StringSwitch<bool>(Name)
				53	.Case("fn", true)
				54	.Case("var", true)
				55	.Case("property", true)
				56	.Case("typedef", true)
				57
				58	.Case("overload", true)
				59
				60	.Case("defgroup", true)
				61	.Case("ingroup", true)
				62	.Case("addtogroup", true)
				63	.Case("weakgroup", true)
				64	.Case("name", true)
				65
				66	.Case("section", true)
				67	.Case("subsection", true)
				68	.Case("subsubsection", true)
				69	.Case("paragraph", true)
				70
				71	.Case("mainpage", true)
				72	.Case("subpage", true)
				73	.Case("ref", true)
				74
				75	.Default(false);
				76
				77	if (Result)
				78	return true;
				79
				80	for (VerbatimLineCommandVector::const_iterator
				81	I = VerbatimLineCommands.begin(),
				82	E = VerbatimLineCommands.end();
				83	I != E; ++I)
				84	if (I->Name == Name)
				85	return true;
				86
				87	return false;
				88	}
				89
				90	void Lexer::skipLineStartingDecorations() {
				91	// This function should be called only for C comments
				92	assert(CommentState == LCS_InsideCComment);
				93
				94	if (BufferPtr == CommentEnd)
				95	return;
				96
				97	switch (*BufferPtr) {
				98	case ' ':
				99	case '\t':
				100	case '\f':
				101	case '\v': {
				102	const char *NewBufferPtr = BufferPtr;
				103	NewBufferPtr++;
				104	if (NewBufferPtr == CommentEnd)
				105	return;
				106
				107	char C = *NewBufferPtr;
				108	while (C == ' ' \|\| C == '\t' \|\| C == '\f' \|\| C == '\v') {
				109	NewBufferPtr++;
				110	if (NewBufferPtr == CommentEnd)
				111	return;
				112	C = *NewBufferPtr;
				113	}
				114	if (C == '*')
				115	BufferPtr = NewBufferPtr + 1;
				116	break;
				117	}
				118	case '*':
				119	BufferPtr++;
				120	break;
				121	}
				122	}
				123
				124	namespace {
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	125	/// Returns pointer to the first newline character in the string.
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	126	const char findNewline(const char BufferPtr, const char *BufferEnd) {
				127	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				128	const char C = *BufferPtr;
				129	if (C == '\n' \|\| C == '\r')
				130	return BufferPtr;
				131	}
				132	return BufferEnd;
				133	}
				134
				135	const char skipNewline(const char BufferPtr, const char *BufferEnd) {
				136	if (BufferPtr == BufferEnd)
				137	return BufferPtr;
				138
				139	if (*BufferPtr == '\n')
				140	BufferPtr++;
				141	else {
				142	assert(*BufferPtr == '\r');
				143	BufferPtr++;
				144	if (BufferPtr != BufferEnd && *BufferPtr == '\n')
				145	BufferPtr++;
				146	}
				147	return BufferPtr;
				148	}
				149
				150	bool isHTMLIdentifierCharacter(char C) {
				151	return (C >= 'a' && C <= 'z') \|\|
				152	(C >= 'A' && C <= 'Z') \|\|
				153	(C >= '0' && C <= '9');
				154	}
				155
				156	const char skipHTMLIdentifier(const char BufferPtr, const char *BufferEnd) {
				157	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				158	if (!isHTMLIdentifierCharacter(*BufferPtr))
				159	return BufferPtr;
				160	}
				161	return BufferEnd;
				162	}
				163
				164	/// Skip HTML string quoted in single or double quotes. Escaping quotes inside
				165	/// string allowed.
				166	///
				167	/// Returns pointer to closing quote.
				168	const char skipHTMLQuotedString(const char BufferPtr, const char *BufferEnd)
				169	{
				170	const char Quote = *BufferPtr;
				171	assert(Quote == '\"' \|\| Quote == '\'');
				172
				173	BufferPtr++;
				174	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				175	const char C = *BufferPtr;
				176	if (C == Quote && BufferPtr[-1] != '\\')
				177	return BufferPtr;
				178	}
				179	return BufferEnd;
				180	}
				181
				182	bool isHorizontalWhitespace(char C) {
				183	return C == ' ' \|\| C == '\t' \|\| C == '\f' \|\| C == '\v';
				184	}
				185
				186	bool isWhitespace(char C) {
				187	return C == ' ' \|\| C == '\n' \|\| C == '\r' \|\|
				188	C == '\t' \|\| C == '\f' \|\| C == '\v';
				189	}
				190
				191	const char skipWhitespace(const char BufferPtr, const char *BufferEnd) {
				192	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				193	if (!isWhitespace(*BufferPtr))
				194	return BufferPtr;
				195	}
				196	return BufferEnd;
				197	}
				198
				199	bool isCommandNameCharacter(char C) {
				200	return (C >= 'a' && C <= 'z') \|\|
				201	(C >= 'A' && C <= 'Z') \|\|
				202	(C >= '0' && C <= '9');
				203	}
				204
				205	const char skipCommandName(const char BufferPtr, const char *BufferEnd) {
				206	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				207	if (!isCommandNameCharacter(*BufferPtr))
				208	return BufferPtr;
				209	}
				210	return BufferEnd;
				211	}
				212
				213	/// Return the one past end pointer for BCPL comments.
				214	/// Handles newlines escaped with backslash or trigraph for backslahs.
				215	const char findBCPLCommentEnd(const char BufferPtr, const char *BufferEnd) {
				216	const char *CurPtr = BufferPtr;
				217	while (CurPtr != BufferEnd) {
				218	char C = *CurPtr;
				219	while (C != '\n' && C != '\r') {
				220	CurPtr++;
				221	if (CurPtr == BufferEnd)
				222	return BufferEnd;
				223	C = *CurPtr;
				224	}
				225	// We found a newline, check if it is escaped.
				226	const char *EscapePtr = CurPtr - 1;
				227	while(isHorizontalWhitespace(*EscapePtr))
				228	EscapePtr--;
				229
				230	if (*EscapePtr == '\\' \|\|
				231	(EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
				232	EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
				233	// We found an escaped newline.
				234	CurPtr = skipNewline(CurPtr, BufferEnd);
				235	} else
				236	return CurPtr; // Not an escaped newline.
				237	}
				238	return BufferEnd;
				239	}
				240
				241	/// Return the one past end pointer for C comments.
				242	/// Very dumb, does not handle escaped newlines or trigraphs.
				243	const char findCCommentEnd(const char BufferPtr, const char *BufferEnd) {
				244	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				245	if (BufferPtr == '') {
				246	assert(BufferPtr + 1 != BufferEnd);
				247	if (*(BufferPtr + 1) == '/')
				248	return BufferPtr;
				249	}
				250	}
				251	llvm_unreachable("buffer end hit before '*/' was seen");
				252	}
				253	} // unnamed namespace
				254
				255	void Lexer::lexCommentText(Token &T) {
				256	assert(CommentState == LCS_InsideBCPLComment \|\|
				257	CommentState == LCS_InsideCComment);
				258
				259	switch (State) {
				260	case LS_Normal:
				261	break;
				262	case LS_VerbatimBlockFirstLine:
				263	lexVerbatimBlockFirstLine(T);
				264	return;
				265	case LS_VerbatimBlockBody:
				266	lexVerbatimBlockBody(T);
				267	return;
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	268	case LS_VerbatimLineText:
				269	lexVerbatimLineText(T);
				270	return;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	271	case LS_HTMLOpenTag:
				272	lexHTMLOpenTag(T);
				273	return;
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	274	case LS_HTMLCloseTag:
				275	lexHTMLCloseTag(T);
				276	return;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	277	}
				278
				279	assert(State == LS_Normal);
				280
				281	const char *TokenPtr = BufferPtr;
				282	assert(TokenPtr < CommentEnd);
				283	while (TokenPtr != CommentEnd) {
				284	switch(*TokenPtr) {
				285	case '\\':
				286	case '@': {
				287	TokenPtr++;
				288	if (TokenPtr == CommentEnd) {
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	289	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	290	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	291	T.setText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	292	return;
				293	}
				294	char C = *TokenPtr;
				295	switch (C) {
				296	default:
				297	break;
				298
				299	case '\\': case '@': case '&': case '$':
				300	case '#': case '<': case '>': case '%':
				301	case '\"': case '.': case ':':
				302	// This is one of \\ \@ \& \$ etc escape sequences.
				303	TokenPtr++;
				304	if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
				305	// This is the \:: escape sequence.
				306	TokenPtr++;
				307	}
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	308	StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	309	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	310	T.setText(UnescapedText);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	311	return;
				312	}
				313
				314	// Don't make zero-length commands.
				315	if (!isCommandNameCharacter(*TokenPtr)) {
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	316	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	317	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	318	T.setText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	319	return;
				320	}
				321
				322	TokenPtr = skipCommandName(TokenPtr, CommentEnd);
				323	unsigned Length = TokenPtr - (BufferPtr + 1);
				324
				325	// Hardcoded support for lexing LaTeX formula commands
				326	// \f$ \f[ \f] \f{ \f} as a single command.
				327	if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
				328	C = *TokenPtr;
				329	if (C == '$' \|\| C == '[' \|\| C == ']' \|\| C == '{' \|\| C == '}') {
				330	TokenPtr++;
				331	Length++;
				332	}
				333	}
				334
				335	const StringRef CommandName(BufferPtr + 1, Length);
				336	StringRef EndName;
				337
				338	if (isVerbatimBlockCommand(CommandName, EndName)) {
				339	setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
				340	return;
				341	}
				342	if (isVerbatimLineCommand(CommandName)) {
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	343	setupAndLexVerbatimLine(T, TokenPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	344	return;
				345	}
				346	formTokenWithChars(T, TokenPtr, tok::command);
				347	T.setCommandName(CommandName);
				348	return;
				349	}
				350
				351	case '<': {
				352	TokenPtr++;
				353	if (TokenPtr == CommentEnd) {
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	354	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	355	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	356	T.setText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	357	return;
				358	}
				359	const char C = *TokenPtr;
				360	if (isHTMLIdentifierCharacter(C))
				361	setupAndLexHTMLOpenTag(T);
				362	else if (C == '/')
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	363	setupAndLexHTMLCloseTag(T);
Dmitri Gribenko	5676d32	2012-06-27 23:28:29 +0000	[diff] [blame]	364	else {
				365	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
				366	formTokenWithChars(T, TokenPtr, tok::text);
				367	T.setText(Text);
				368	}
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	369	return;
				370	}
				371
				372	case '\n':
				373	case '\r':
				374	TokenPtr = skipNewline(TokenPtr, CommentEnd);
				375	formTokenWithChars(T, TokenPtr, tok::newline);
				376
				377	if (CommentState == LCS_InsideCComment)
				378	skipLineStartingDecorations();
				379	return;
				380
				381	default: {
				382	while (true) {
				383	TokenPtr++;
				384	if (TokenPtr == CommentEnd)
				385	break;
				386	char C = *TokenPtr;
				387	if(C == '\n' \|\| C == '\r' \|\|
				388	C == '\\' \|\| C == '@' \|\| C == '<')
				389	break;
				390	}
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	391	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	392	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	393	T.setText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	394	return;
				395	}
				396	}
				397	}
				398	}
				399
				400	void Lexer::setupAndLexVerbatimBlock(Token &T,
				401	const char *TextBegin,
				402	char Marker, StringRef EndName) {
				403	VerbatimBlockEndCommandName.clear();
				404	VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
				405	VerbatimBlockEndCommandName.append(EndName);
				406
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	407	StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	408	formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	409	T.setVerbatimBlockName(Name);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	410
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	411	// If there is a newline following the verbatim opening command, skip the
				412	// newline so that we don't create an tok::verbatim_block_line with empty
				413	// text content.
				414	if (BufferPtr != CommentEnd) {
				415	const char C = *BufferPtr;
				416	if (C == '\n' \|\| C == '\r') {
				417	BufferPtr = skipNewline(BufferPtr, CommentEnd);
				418	State = LS_VerbatimBlockBody;
				419	return;
				420	}
				421	}
				422
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	423	State = LS_VerbatimBlockFirstLine;
				424	}
				425
				426	void Lexer::lexVerbatimBlockFirstLine(Token &T) {
				427	assert(BufferPtr < CommentEnd);
				428
				429	// FIXME: It would be better to scan the text once, finding either the block
				430	// end command or newline.
				431	//
				432	// Extract current line.
				433	const char *Newline = findNewline(BufferPtr, CommentEnd);
				434	StringRef Line(BufferPtr, Newline - BufferPtr);
				435
				436	// Look for end command in current line.
				437	size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	438	const char *TextEnd;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	439	const char *NextLine;
				440	if (Pos == StringRef::npos) {
				441	// Current line is completely verbatim.
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	442	TextEnd = Newline;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	443	NextLine = skipNewline(Newline, CommentEnd);
				444	} else if (Pos == 0) {
				445	// Current line contains just an end command.
				446	const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	447	StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	448	formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	449	T.setVerbatimBlockName(Name);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	450	State = LS_Normal;
				451	return;
				452	} else {
				453	// There is some text, followed by end command. Extract text first.
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	454	TextEnd = BufferPtr + Pos;
				455	NextLine = TextEnd;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	456	}
				457
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	458	StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	459	formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	460	T.setVerbatimBlockText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	461
				462	State = LS_VerbatimBlockBody;
				463	}
				464
				465	void Lexer::lexVerbatimBlockBody(Token &T) {
				466	assert(State == LS_VerbatimBlockBody);
				467
				468	if (CommentState == LCS_InsideCComment)
				469	skipLineStartingDecorations();
				470
				471	lexVerbatimBlockFirstLine(T);
				472	}
				473
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	474	void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) {
				475	const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
				476	formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
				477	T.setVerbatimLineName(Name);
				478
				479	State = LS_VerbatimLineText;
				480	}
				481
				482	void Lexer::lexVerbatimLineText(Token &T) {
				483	assert(State == LS_VerbatimLineText);
				484
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	485	// Extract current line.
				486	const char *Newline = findNewline(BufferPtr, CommentEnd);
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	487	const StringRef Text(BufferPtr, Newline - BufferPtr);
				488	formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	489	T.setVerbatimLineText(Text);
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	490
				491	State = LS_Normal;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	492	}
				493
				494	void Lexer::setupAndLexHTMLOpenTag(Token &T) {
				495	assert(BufferPtr[0] == '<' && isHTMLIdentifierCharacter(BufferPtr[1]));
				496	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
				497
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	498	StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	499	formTokenWithChars(T, TagNameEnd, tok::html_tag_open);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	500	T.setHTMLTagOpenName(Name);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	501
				502	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
				503
				504	if (BufferPtr != CommentEnd && *BufferPtr == '>') {
				505	BufferPtr++;
				506	return;
				507	}
				508
				509	if (BufferPtr != CommentEnd && isHTMLIdentifierCharacter(*BufferPtr))
				510	State = LS_HTMLOpenTag;
				511	}
				512
				513	void Lexer::lexHTMLOpenTag(Token &T) {
				514	assert(State == LS_HTMLOpenTag);
				515
				516	const char *TokenPtr = BufferPtr;
				517	char C = *TokenPtr;
				518	if (isHTMLIdentifierCharacter(C)) {
				519	TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	520	StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	521	formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	522	T.setHTMLIdent(Ident);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	523	} else {
				524	switch (C) {
				525	case '=':
				526	TokenPtr++;
				527	formTokenWithChars(T, TokenPtr, tok::html_equals);
				528	break;
				529	case '\"':
				530	case '\'': {
				531	const char *OpenQuote = TokenPtr;
				532	TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
				533	const char *ClosingQuote = TokenPtr;
				534	if (TokenPtr != CommentEnd) // Skip closing quote.
				535	TokenPtr++;
				536	formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
				537	T.setHTMLQuotedString(StringRef(OpenQuote + 1,
				538	ClosingQuote - (OpenQuote + 1)));
				539	break;
				540	}
				541	case '>':
				542	TokenPtr++;
				543	formTokenWithChars(T, TokenPtr, tok::html_greater);
				544	break;
				545	}
				546	}
				547
				548	// Now look ahead and return to normal state if we don't see any HTML tokens
				549	// ahead.
				550	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
				551	if (BufferPtr == CommentEnd) {
				552	State = LS_Normal;
				553	return;
				554	}
				555
				556	C = *BufferPtr;
				557	if (!isHTMLIdentifierCharacter(C) &&
				558	C != '=' && C != '\"' && C != '\'' && C != '>') {
				559	State = LS_Normal;
				560	return;
				561	}
				562	}
				563
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	564	void Lexer::setupAndLexHTMLCloseTag(Token &T) {
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	565	assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
				566
				567	const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
				568	const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
				569
				570	const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	571
				572	formTokenWithChars(T, End, tok::html_tag_close);
				573	T.setHTMLTagCloseName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	574
				575	if (BufferPtr != CommentEnd && *BufferPtr == '>')
				576	State = LS_HTMLCloseTag;
				577	}
				578
				579	void Lexer::lexHTMLCloseTag(Token &T) {
				580	assert(BufferPtr != CommentEnd && *BufferPtr == '>');
				581
				582	formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
				583	State = LS_Normal;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	584	}
				585
				586	Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
				587	const char BufferStart, const char BufferEnd):
				588	BufferStart(BufferStart), BufferEnd(BufferEnd),
				589	FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
				590	CommentState(LCS_BeforeComment), State(LS_Normal) {
				591	}
				592
				593	void Lexer::lex(Token &T) {
				594	again:
				595	switch (CommentState) {
				596	case LCS_BeforeComment:
				597	if (BufferPtr == BufferEnd) {
				598	formTokenWithChars(T, BufferPtr, tok::eof);
				599	return;
				600	}
				601
				602	assert(*BufferPtr == '/');
				603	BufferPtr++; // Skip first slash.
				604	switch(*BufferPtr) {
				605	case '/': { // BCPL comment.
				606	BufferPtr++; // Skip second slash.
				607
				608	if (BufferPtr != BufferEnd) {
				609	// Skip Doxygen magic marker, if it is present.
				610	// It might be missing because of a typo //< or /*<, or because we
				611	// merged this non-Doxygen comment into a bunch of Doxygen comments
				612	// around it: /** ... / / ... / /* ... */
				613	const char C = *BufferPtr;
				614	if (C == '/' \|\| C == '!')
				615	BufferPtr++;
				616	}
				617
				618	// Skip less-than symbol that marks trailing comments.
				619	// Skip it even if the comment is not a Doxygen one, because //< and /*<
				620	// are frequent typos.
				621	if (BufferPtr != BufferEnd && *BufferPtr == '<')
				622	BufferPtr++;
				623
				624	CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	625	if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
				626	State = LS_Normal;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	627	CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
				628	goto again;
				629	}
				630	case '*': { // C comment.
				631	BufferPtr++; // Skip star.
				632
				633	// Skip Doxygen magic marker.
				634	const char C = *BufferPtr;
				635	if ((C == '' && (BufferPtr + 1) != '/') \|\| C == '!')
				636	BufferPtr++;
				637
				638	// Skip less-than symbol that marks trailing comments.
				639	if (BufferPtr != BufferEnd && *BufferPtr == '<')
				640	BufferPtr++;
				641
				642	CommentState = LCS_InsideCComment;
				643	State = LS_Normal;
				644	CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
				645	goto again;
				646	}
				647	default:
				648	llvm_unreachable("second character of comment should be '/' or '*'");
				649	}
				650
				651	case LCS_BetweenComments: {
				652	// Consecutive comments are extracted only if there is only whitespace
				653	// between them. So we can search for the start of the next comment.
				654	const char *EndWhitespace = BufferPtr;
				655	while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
				656	EndWhitespace++;
				657
				658	// Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	659	// between them) into a newline. We have two newlines between C comments
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	660	// in total (first one was synthesized after a comment).
				661	formTokenWithChars(T, EndWhitespace, tok::newline);
				662
				663	CommentState = LCS_BeforeComment;
				664	break;
				665	}
				666
				667	case LCS_InsideBCPLComment:
				668	case LCS_InsideCComment:
				669	if (BufferPtr != CommentEnd) {
				670	lexCommentText(T);
				671	break;
				672	} else {
				673	// Skip C comment closing sequence.
				674	if (CommentState == LCS_InsideCComment) {
				675	assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
				676	BufferPtr += 2;
				677	assert(BufferPtr <= BufferEnd);
				678
				679	// Synthenize newline just after the C comment, regardless if there is
				680	// actually a newline.
				681	formTokenWithChars(T, BufferPtr, tok::newline);
				682
				683	CommentState = LCS_BetweenComments;
				684	break;
				685	} else {
				686	// Don't synthesized a newline after BCPL comment.
				687	CommentState = LCS_BetweenComments;
				688	goto again;
				689	}
				690	}
				691	}
				692	}
				693
				694	StringRef Lexer::getSpelling(const Token &Tok,
				695	const SourceManager &SourceMgr,
				696	bool *Invalid) const {
				697	SourceLocation Loc = Tok.getLocation();
				698	std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
				699
				700	bool InvalidTemp = false;
				701	StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
				702	if (InvalidTemp) {
				703	*Invalid = true;
				704	return StringRef();
				705	}
				706
				707	const char *Begin = File.data() + LocInfo.second;
				708	return StringRef(Begin, Tok.getLength());
				709	}
				710
				711	void Lexer::addVerbatimBlockCommand(StringRef BeginName, StringRef EndName) {
				712	VerbatimBlockCommand VBC;
				713	VBC.BeginName = BeginName;
				714	VBC.EndName = EndName;
				715	VerbatimBlockCommands.push_back(VBC);
				716	}
				717
				718	void Lexer::addVerbatimLineCommand(StringRef Name) {
				719	VerbatimLineCommand VLC;
				720	VLC.Name = Name;
				721	VerbatimLineCommands.push_back(VLC);
				722	}
				723
				724	} // end namespace comments
				725	} // end namespace clang
				726