Blame - lib/AST/CommentLexer.cpp - platform/external/clang

blob: 1f4955d1cf2646905853dc85b7c823be975ac88c [file] [log] [blame]

Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	1	#include "clang/AST/CommentLexer.h"
				2	#include "llvm/ADT/StringSwitch.h"
				3	#include "llvm/Support/ErrorHandling.h"
				4
				5	namespace clang {
				6	namespace comments {
				7
				8	void Token::dump(const Lexer &L, const SourceManager &SM) const {
				9	llvm::errs() << "comments::Token Kind=" << Kind << " ";
				10	Loc.dump(SM);
				11	llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
				12	}
				13
				14	bool Lexer::isVerbatimBlockCommand(StringRef BeginName,
				15	StringRef &EndName) const {
				16	const char Result = llvm::StringSwitch<const char >(BeginName)
				17	.Case("code", "endcode")
				18	.Case("verbatim", "endverbatim")
				19	.Case("htmlonly", "endhtmlonly")
				20	.Case("latexonly", "endlatexonly")
				21	.Case("xmlonly", "endxmlonly")
				22	.Case("manonly", "endmanonly")
				23	.Case("rtfonly", "endrtfonly")
				24
				25	.Case("dot", "enddot")
				26	.Case("msc", "endmsc")
				27
				28	.Case("f$", "f$") // Inline LaTeX formula
				29	.Case("f[", "f]") // Displayed LaTeX formula
				30	.Case("f{", "f}") // LaTeX environment
				31
				32	.Default(NULL);
				33
				34	if (Result) {
				35	EndName = Result;
				36	return true;
				37	}
				38
				39	for (VerbatimBlockCommandVector::const_iterator
				40	I = VerbatimBlockCommands.begin(),
				41	E = VerbatimBlockCommands.end();
				42	I != E; ++I)
				43	if (I->BeginName == BeginName) {
				44	EndName = I->EndName;
				45	return true;
				46	}
				47
				48	return false;
				49	}
				50
				51	bool Lexer::isVerbatimLineCommand(StringRef Name) const {
				52	bool Result = llvm::StringSwitch<bool>(Name)
				53	.Case("fn", true)
				54	.Case("var", true)
				55	.Case("property", true)
				56	.Case("typedef", true)
				57
				58	.Case("overload", true)
				59
				60	.Case("defgroup", true)
				61	.Case("ingroup", true)
				62	.Case("addtogroup", true)
				63	.Case("weakgroup", true)
				64	.Case("name", true)
				65
				66	.Case("section", true)
				67	.Case("subsection", true)
				68	.Case("subsubsection", true)
				69	.Case("paragraph", true)
				70
				71	.Case("mainpage", true)
				72	.Case("subpage", true)
				73	.Case("ref", true)
				74
				75	.Default(false);
				76
				77	if (Result)
				78	return true;
				79
				80	for (VerbatimLineCommandVector::const_iterator
				81	I = VerbatimLineCommands.begin(),
				82	E = VerbatimLineCommands.end();
				83	I != E; ++I)
				84	if (I->Name == Name)
				85	return true;
				86
				87	return false;
				88	}
				89
				90	void Lexer::skipLineStartingDecorations() {
				91	// This function should be called only for C comments
				92	assert(CommentState == LCS_InsideCComment);
				93
				94	if (BufferPtr == CommentEnd)
				95	return;
				96
				97	switch (*BufferPtr) {
				98	case ' ':
				99	case '\t':
				100	case '\f':
				101	case '\v': {
				102	const char *NewBufferPtr = BufferPtr;
				103	NewBufferPtr++;
				104	if (NewBufferPtr == CommentEnd)
				105	return;
				106
				107	char C = *NewBufferPtr;
				108	while (C == ' ' \|\| C == '\t' \|\| C == '\f' \|\| C == '\v') {
				109	NewBufferPtr++;
				110	if (NewBufferPtr == CommentEnd)
				111	return;
				112	C = *NewBufferPtr;
				113	}
				114	if (C == '*')
				115	BufferPtr = NewBufferPtr + 1;
				116	break;
				117	}
				118	case '*':
				119	BufferPtr++;
				120	break;
				121	}
				122	}
				123
				124	namespace {
/// Returns pointer to the first newline character in the string.
///
/// \param BufferPtr start of the range to scan.
/// \param BufferEnd one past the end of the range to scan.
/// \returns pointer to the first '\\n' or '\\r' in [BufferPtr, BufferEnd), or
/// \p BufferEnd if the range contains neither.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    if (C == '\n' || C == '\r')
      return BufferPtr;
  }
  return BufferEnd;
}
				134
				135	const char skipNewline(const char BufferPtr, const char *BufferEnd) {
				136	if (BufferPtr == BufferEnd)
				137	return BufferPtr;
				138
				139	if (*BufferPtr == '\n')
				140	BufferPtr++;
				141	else {
				142	assert(*BufferPtr == '\r');
				143	BufferPtr++;
				144	if (BufferPtr != BufferEnd && *BufferPtr == '\n')
				145	BufferPtr++;
				146	}
				147	return BufferPtr;
				148	}
				149
/// Returns true if \p C may start an HTML identifier, i.e. it is an ASCII
/// letter ('a'-'z' or 'A'-'Z').
bool isHTMLIdentifierStartingCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}
				154
/// Returns true if \p C may appear inside an HTML identifier, i.e. it is an
/// ASCII letter or an ASCII digit.
bool isHTMLIdentifierCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z') ||
         (C >= '0' && C <= '9');
}
				160
				161	const char skipHTMLIdentifier(const char BufferPtr, const char *BufferEnd) {
				162	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				163	if (!isHTMLIdentifierCharacter(*BufferPtr))
				164	return BufferPtr;
				165	}
				166	return BufferEnd;
				167	}
				168
				169	/// Skip HTML string quoted in single or double quotes. Escaping quotes inside
				170	/// string allowed.
				171	///
				172	/// Returns pointer to closing quote.
				173	const char skipHTMLQuotedString(const char BufferPtr, const char *BufferEnd)
				174	{
				175	const char Quote = *BufferPtr;
				176	assert(Quote == '\"' \|\| Quote == '\'');
				177
				178	BufferPtr++;
				179	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				180	const char C = *BufferPtr;
				181	if (C == Quote && BufferPtr[-1] != '\\')
				182	return BufferPtr;
				183	}
				184	return BufferEnd;
				185	}
				186
				187	bool isHorizontalWhitespace(char C) {
				188	return C == ' ' \|\| C == '\t' \|\| C == '\f' \|\| C == '\v';
				189	}
				190
				191	bool isWhitespace(char C) {
				192	return C == ' ' \|\| C == '\n' \|\| C == '\r' \|\|
				193	C == '\t' \|\| C == '\f' \|\| C == '\v';
				194	}
				195
				196	const char skipWhitespace(const char BufferPtr, const char *BufferEnd) {
				197	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				198	if (!isWhitespace(*BufferPtr))
				199	return BufferPtr;
				200	}
				201	return BufferEnd;
				202	}
				203
				204	bool isCommandNameCharacter(char C) {
				205	return (C >= 'a' && C <= 'z') \|\|
				206	(C >= 'A' && C <= 'Z') \|\|
				207	(C >= '0' && C <= '9');
				208	}
				209
				210	const char skipCommandName(const char BufferPtr, const char *BufferEnd) {
				211	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				212	if (!isCommandNameCharacter(*BufferPtr))
				213	return BufferPtr;
				214	}
				215	return BufferEnd;
				216	}
				217
				218	/// Return the one past end pointer for BCPL comments.
				219	/// Handles newlines escaped with backslash or trigraph for backslahs.
				220	const char findBCPLCommentEnd(const char BufferPtr, const char *BufferEnd) {
				221	const char *CurPtr = BufferPtr;
				222	while (CurPtr != BufferEnd) {
				223	char C = *CurPtr;
				224	while (C != '\n' && C != '\r') {
				225	CurPtr++;
				226	if (CurPtr == BufferEnd)
				227	return BufferEnd;
				228	C = *CurPtr;
				229	}
				230	// We found a newline, check if it is escaped.
				231	const char *EscapePtr = CurPtr - 1;
				232	while(isHorizontalWhitespace(*EscapePtr))
				233	EscapePtr--;
				234
				235	if (*EscapePtr == '\\' \|\|
				236	(EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
				237	EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
				238	// We found an escaped newline.
				239	CurPtr = skipNewline(CurPtr, BufferEnd);
				240	} else
				241	return CurPtr; // Not an escaped newline.
				242	}
				243	return BufferEnd;
				244	}
				245
				246	/// Return the one past end pointer for C comments.
				247	/// Very dumb, does not handle escaped newlines or trigraphs.
				248	const char findCCommentEnd(const char BufferPtr, const char *BufferEnd) {
				249	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
				250	if (BufferPtr == '') {
				251	assert(BufferPtr + 1 != BufferEnd);
				252	if (*(BufferPtr + 1) == '/')
				253	return BufferPtr;
				254	}
				255	}
				256	llvm_unreachable("buffer end hit before '*/' was seen");
				257	}
				258	} // unnamed namespace
				259
				260	void Lexer::lexCommentText(Token &T) {
				261	assert(CommentState == LCS_InsideBCPLComment \|\|
				262	CommentState == LCS_InsideCComment);
				263
				264	switch (State) {
				265	case LS_Normal:
				266	break;
				267	case LS_VerbatimBlockFirstLine:
				268	lexVerbatimBlockFirstLine(T);
				269	return;
				270	case LS_VerbatimBlockBody:
				271	lexVerbatimBlockBody(T);
				272	return;
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	273	case LS_VerbatimLineText:
				274	lexVerbatimLineText(T);
				275	return;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	276	case LS_HTMLOpenTag:
				277	lexHTMLOpenTag(T);
				278	return;
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	279	case LS_HTMLCloseTag:
				280	lexHTMLCloseTag(T);
				281	return;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	282	}
				283
				284	assert(State == LS_Normal);
				285
				286	const char *TokenPtr = BufferPtr;
				287	assert(TokenPtr < CommentEnd);
				288	while (TokenPtr != CommentEnd) {
				289	switch(*TokenPtr) {
				290	case '\\':
				291	case '@': {
				292	TokenPtr++;
				293	if (TokenPtr == CommentEnd) {
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	294	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	295	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	296	T.setText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	297	return;
				298	}
				299	char C = *TokenPtr;
				300	switch (C) {
				301	default:
				302	break;
				303
				304	case '\\': case '@': case '&': case '$':
				305	case '#': case '<': case '>': case '%':
				306	case '\"': case '.': case ':':
				307	// This is one of \\ \@ \& \$ etc escape sequences.
				308	TokenPtr++;
				309	if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
				310	// This is the \:: escape sequence.
				311	TokenPtr++;
				312	}
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	313	StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	314	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	315	T.setText(UnescapedText);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	316	return;
				317	}
				318
				319	// Don't make zero-length commands.
				320	if (!isCommandNameCharacter(*TokenPtr)) {
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	321	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	322	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	323	T.setText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	324	return;
				325	}
				326
				327	TokenPtr = skipCommandName(TokenPtr, CommentEnd);
				328	unsigned Length = TokenPtr - (BufferPtr + 1);
				329
				330	// Hardcoded support for lexing LaTeX formula commands
				331	// \f$ \f[ \f] \f{ \f} as a single command.
				332	if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
				333	C = *TokenPtr;
				334	if (C == '$' \|\| C == '[' \|\| C == ']' \|\| C == '{' \|\| C == '}') {
				335	TokenPtr++;
				336	Length++;
				337	}
				338	}
				339
				340	const StringRef CommandName(BufferPtr + 1, Length);
				341	StringRef EndName;
				342
				343	if (isVerbatimBlockCommand(CommandName, EndName)) {
				344	setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
				345	return;
				346	}
				347	if (isVerbatimLineCommand(CommandName)) {
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	348	setupAndLexVerbatimLine(T, TokenPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	349	return;
				350	}
				351	formTokenWithChars(T, TokenPtr, tok::command);
				352	T.setCommandName(CommandName);
				353	return;
				354	}
				355
				356	case '<': {
				357	TokenPtr++;
				358	if (TokenPtr == CommentEnd) {
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	359	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	360	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	361	T.setText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	362	return;
				363	}
				364	const char C = *TokenPtr;
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	365	if (isHTMLIdentifierStartingCharacter(C))
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	366	setupAndLexHTMLOpenTag(T);
				367	else if (C == '/')
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	368	setupAndLexHTMLCloseTag(T);
Dmitri Gribenko	5676d32	2012-06-27 23:28:29 +0000	[diff] [blame]	369	else {
				370	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
				371	formTokenWithChars(T, TokenPtr, tok::text);
				372	T.setText(Text);
				373	}
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	374	return;
				375	}
				376
				377	case '\n':
				378	case '\r':
				379	TokenPtr = skipNewline(TokenPtr, CommentEnd);
				380	formTokenWithChars(T, TokenPtr, tok::newline);
				381
				382	if (CommentState == LCS_InsideCComment)
				383	skipLineStartingDecorations();
				384	return;
				385
				386	default: {
				387	while (true) {
				388	TokenPtr++;
				389	if (TokenPtr == CommentEnd)
				390	break;
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	391	const char C = *TokenPtr;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	392	if(C == '\n' \|\| C == '\r' \|\|
				393	C == '\\' \|\| C == '@' \|\| C == '<')
				394	break;
				395	}
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	396	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	397	formTokenWithChars(T, TokenPtr, tok::text);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	398	T.setText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	399	return;
				400	}
				401	}
				402	}
				403	}
				404
				405	void Lexer::setupAndLexVerbatimBlock(Token &T,
				406	const char *TextBegin,
				407	char Marker, StringRef EndName) {
				408	VerbatimBlockEndCommandName.clear();
				409	VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
				410	VerbatimBlockEndCommandName.append(EndName);
				411
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	412	StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	413	formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	414	T.setVerbatimBlockName(Name);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	415
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	416	// If there is a newline following the verbatim opening command, skip the
				417	// newline so that we don't create an tok::verbatim_block_line with empty
				418	// text content.
				419	if (BufferPtr != CommentEnd) {
				420	const char C = *BufferPtr;
				421	if (C == '\n' \|\| C == '\r') {
				422	BufferPtr = skipNewline(BufferPtr, CommentEnd);
				423	State = LS_VerbatimBlockBody;
				424	return;
				425	}
				426	}
				427
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	428	State = LS_VerbatimBlockFirstLine;
				429	}
				430
				431	void Lexer::lexVerbatimBlockFirstLine(Token &T) {
				432	assert(BufferPtr < CommentEnd);
				433
				434	// FIXME: It would be better to scan the text once, finding either the block
				435	// end command or newline.
				436	//
				437	// Extract current line.
				438	const char *Newline = findNewline(BufferPtr, CommentEnd);
				439	StringRef Line(BufferPtr, Newline - BufferPtr);
				440
				441	// Look for end command in current line.
				442	size_t Pos = Line.find(VerbatimBlockEndCommandName);
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	443	const char *TextEnd;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	444	const char *NextLine;
				445	if (Pos == StringRef::npos) {
				446	// Current line is completely verbatim.
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	447	TextEnd = Newline;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	448	NextLine = skipNewline(Newline, CommentEnd);
				449	} else if (Pos == 0) {
				450	// Current line contains just an end command.
				451	const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	452	StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	453	formTokenWithChars(T, End, tok::verbatim_block_end);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	454	T.setVerbatimBlockName(Name);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	455	State = LS_Normal;
				456	return;
				457	} else {
				458	// There is some text, followed by end command. Extract text first.
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	459	TextEnd = BufferPtr + Pos;
				460	NextLine = TextEnd;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	461	}
				462
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	463	StringRef Text(BufferPtr, TextEnd - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	464	formTokenWithChars(T, NextLine, tok::verbatim_block_line);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	465	T.setVerbatimBlockText(Text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	466
				467	State = LS_VerbatimBlockBody;
				468	}
				469
				470	void Lexer::lexVerbatimBlockBody(Token &T) {
				471	assert(State == LS_VerbatimBlockBody);
				472
				473	if (CommentState == LCS_InsideCComment)
				474	skipLineStartingDecorations();
				475
				476	lexVerbatimBlockFirstLine(T);
				477	}
				478
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	479	void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) {
				480	const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
				481	formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
				482	T.setVerbatimLineName(Name);
				483
				484	State = LS_VerbatimLineText;
				485	}
				486
				487	void Lexer::lexVerbatimLineText(Token &T) {
				488	assert(State == LS_VerbatimLineText);
				489
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	490	// Extract current line.
				491	const char *Newline = findNewline(BufferPtr, CommentEnd);
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	492	const StringRef Text(BufferPtr, Newline - BufferPtr);
				493	formTokenWithChars(T, Newline, tok::verbatim_line_text);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	494	T.setVerbatimLineText(Text);
Dmitri Gribenko	962668d	2012-06-27 16:53:58 +0000	[diff] [blame]	495
				496	State = LS_Normal;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	497	}
				498
				499	void Lexer::setupAndLexHTMLOpenTag(Token &T) {
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	500	assert(BufferPtr[0] == '<' &&
				501	isHTMLIdentifierStartingCharacter(BufferPtr[1]));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	502	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
				503
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	504	StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	505	formTokenWithChars(T, TagNameEnd, tok::html_tag_open);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	506	T.setHTMLTagOpenName(Name);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	507
				508	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
				509
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	510	const char C = *BufferPtr;
				511	if (BufferPtr != CommentEnd &&
Dmitri Gribenko	a5ef44f	2012-07-11 21:38:39 +0000	[diff] [blame]	512	(C == '>' \|\| C == '/' \|\| isHTMLIdentifierStartingCharacter(C)))
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	513	State = LS_HTMLOpenTag;
				514	}
				515
				516	void Lexer::lexHTMLOpenTag(Token &T) {
				517	assert(State == LS_HTMLOpenTag);
				518
				519	const char *TokenPtr = BufferPtr;
				520	char C = *TokenPtr;
				521	if (isHTMLIdentifierCharacter(C)) {
				522	TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	523	StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	524	formTokenWithChars(T, TokenPtr, tok::html_ident);
Dmitri Gribenko	f5e0aea	2012-06-27 16:30:35 +0000	[diff] [blame]	525	T.setHTMLIdent(Ident);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	526	} else {
				527	switch (C) {
				528	case '=':
				529	TokenPtr++;
				530	formTokenWithChars(T, TokenPtr, tok::html_equals);
				531	break;
				532	case '\"':
				533	case '\'': {
				534	const char *OpenQuote = TokenPtr;
				535	TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
				536	const char *ClosingQuote = TokenPtr;
				537	if (TokenPtr != CommentEnd) // Skip closing quote.
				538	TokenPtr++;
				539	formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
				540	T.setHTMLQuotedString(StringRef(OpenQuote + 1,
				541	ClosingQuote - (OpenQuote + 1)));
				542	break;
				543	}
				544	case '>':
				545	TokenPtr++;
				546	formTokenWithChars(T, TokenPtr, tok::html_greater);
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	547	State = LS_Normal;
				548	return;
Dmitri Gribenko	a5ef44f	2012-07-11 21:38:39 +0000	[diff] [blame]	549	case '/':
				550	TokenPtr++;
				551	if (TokenPtr != CommentEnd && *TokenPtr == '>') {
				552	TokenPtr++;
				553	formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
				554	} else {
				555	StringRef Text(BufferPtr, TokenPtr - BufferPtr);
				556	formTokenWithChars(T, TokenPtr, tok::text);
				557	T.setText(Text);
				558	}
				559	State = LS_Normal;
				560	return;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	561	}
				562	}
				563
				564	// Now look ahead and return to normal state if we don't see any HTML tokens
				565	// ahead.
				566	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
				567	if (BufferPtr == CommentEnd) {
				568	State = LS_Normal;
				569	return;
				570	}
				571
				572	C = *BufferPtr;
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	573	if (!isHTMLIdentifierStartingCharacter(C) &&
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	574	C != '=' && C != '\"' && C != '\'' && C != '>') {
				575	State = LS_Normal;
				576	return;
				577	}
				578	}
				579
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	580	void Lexer::setupAndLexHTMLCloseTag(Token &T) {
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	581	assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
				582
				583	const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
				584	const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
				585
				586	const char *End = skipWhitespace(TagNameEnd, CommentEnd);
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	587
				588	formTokenWithChars(T, End, tok::html_tag_close);
				589	T.setHTMLTagCloseName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	590
				591	if (BufferPtr != CommentEnd && *BufferPtr == '>')
				592	State = LS_HTMLCloseTag;
				593	}
				594
				595	void Lexer::lexHTMLCloseTag(Token &T) {
				596	assert(BufferPtr != CommentEnd && *BufferPtr == '>');
				597
				598	formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
				599	State = LS_Normal;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	600	}
				601
				602	Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
				603	const char BufferStart, const char BufferEnd):
				604	BufferStart(BufferStart), BufferEnd(BufferEnd),
				605	FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
				606	CommentState(LCS_BeforeComment), State(LS_Normal) {
				607	}
				608
				609	void Lexer::lex(Token &T) {
				610	again:
				611	switch (CommentState) {
				612	case LCS_BeforeComment:
				613	if (BufferPtr == BufferEnd) {
				614	formTokenWithChars(T, BufferPtr, tok::eof);
				615	return;
				616	}
				617
				618	assert(*BufferPtr == '/');
				619	BufferPtr++; // Skip first slash.
				620	switch(*BufferPtr) {
				621	case '/': { // BCPL comment.
				622	BufferPtr++; // Skip second slash.
				623
				624	if (BufferPtr != BufferEnd) {
				625	// Skip Doxygen magic marker, if it is present.
				626	// It might be missing because of a typo //< or /*<, or because we
				627	// merged this non-Doxygen comment into a bunch of Doxygen comments
				628	// around it: /** ... / / ... / /* ... */
				629	const char C = *BufferPtr;
				630	if (C == '/' \|\| C == '!')
				631	BufferPtr++;
				632	}
				633
				634	// Skip less-than symbol that marks trailing comments.
				635	// Skip it even if the comment is not a Doxygen one, because //< and /*<
				636	// are frequent typos.
				637	if (BufferPtr != BufferEnd && *BufferPtr == '<')
				638	BufferPtr++;
				639
				640	CommentState = LCS_InsideBCPLComment;
Dmitri Gribenko	8d3ba23	2012-07-06 00:28:32 +0000	[diff] [blame]	641	if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
				642	State = LS_Normal;
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	643	CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
				644	goto again;
				645	}
				646	case '*': { // C comment.
				647	BufferPtr++; // Skip star.
				648
				649	// Skip Doxygen magic marker.
				650	const char C = *BufferPtr;
				651	if ((C == '' && (BufferPtr + 1) != '/') \|\| C == '!')
				652	BufferPtr++;
				653
				654	// Skip less-than symbol that marks trailing comments.
				655	if (BufferPtr != BufferEnd && *BufferPtr == '<')
				656	BufferPtr++;
				657
				658	CommentState = LCS_InsideCComment;
				659	State = LS_Normal;
				660	CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
				661	goto again;
				662	}
				663	default:
				664	llvm_unreachable("second character of comment should be '/' or '*'");
				665	}
				666
				667	case LCS_BetweenComments: {
				668	// Consecutive comments are extracted only if there is only whitespace
				669	// between them. So we can search for the start of the next comment.
				670	const char *EndWhitespace = BufferPtr;
				671	while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
				672	EndWhitespace++;
				673
				674	// Turn any whitespace between comments (and there is only whitespace
Dmitri Gribenko	a99ec10	2012-07-09 21:32:40 +0000	[diff] [blame]	675	// between them -- guaranteed by comment extraction) into a newline. We
				676	// have two newlines between C comments in total (first one was synthesized
				677	// after a comment).
Dmitri Gribenko	2d44d77	2012-06-26 20:39:18 +0000	[diff] [blame]	678	formTokenWithChars(T, EndWhitespace, tok::newline);
				679
				680	CommentState = LCS_BeforeComment;
				681	break;
				682	}
				683
				684	case LCS_InsideBCPLComment:
				685	case LCS_InsideCComment:
				686	if (BufferPtr != CommentEnd) {
				687	lexCommentText(T);
				688	break;
				689	} else {
				690	// Skip C comment closing sequence.
				691	if (CommentState == LCS_InsideCComment) {
				692	assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
				693	BufferPtr += 2;
				694	assert(BufferPtr <= BufferEnd);
				695
				696	// Synthenize newline just after the C comment, regardless if there is
				697	// actually a newline.
				698	formTokenWithChars(T, BufferPtr, tok::newline);
				699
				700	CommentState = LCS_BetweenComments;
				701	break;
				702	} else {
				703	// Don't synthesized a newline after BCPL comment.
				704	CommentState = LCS_BetweenComments;
				705	goto again;
				706	}
				707	}
				708	}
				709	}
				710
				711	StringRef Lexer::getSpelling(const Token &Tok,
				712	const SourceManager &SourceMgr,
				713	bool *Invalid) const {
				714	SourceLocation Loc = Tok.getLocation();
				715	std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
				716
				717	bool InvalidTemp = false;
				718	StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
				719	if (InvalidTemp) {
				720	*Invalid = true;
				721	return StringRef();
				722	}
				723
				724	const char *Begin = File.data() + LocInfo.second;
				725	return StringRef(Begin, Tok.getLength());
				726	}
				727
				728	void Lexer::addVerbatimBlockCommand(StringRef BeginName, StringRef EndName) {
				729	VerbatimBlockCommand VBC;
				730	VBC.BeginName = BeginName;
				731	VBC.EndName = EndName;
				732	VerbatimBlockCommands.push_back(VBC);
				733	}
				734
				735	void Lexer::addVerbatimLineCommand(StringRef Name) {
				736	VerbatimLineCommand VLC;
				737	VLC.Name = Name;
				738	VerbatimLineCommands.push_back(VLC);
				739	}
				740
				741	} // end namespace comments
				742	} // end namespace clang
				743