Blame - llvm/tools/llvm-rc/ResourceScriptToken.cpp - toolchain/llvm-project

blob: 36027d14ba06b92118c21585e799cd3b47304562 [file] [log] [blame]

Marek Sokolowski	719e22d	2017-08-10 16:21:44 +0000	[diff] [blame]	1	//===-- ResourceScriptToken.cpp ---------------------------------- C++--===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===---------------------------------------------------------------------===//
				9	//
				10	// This file implements an interface defined in ResourceScriptToken.h.
				11	// In particular, it defines an .rc script tokenizer.
				12	//
				13	//===---------------------------------------------------------------------===//
				14
				15	#include "ResourceScriptToken.h"
				16	#include "llvm/Support/raw_ostream.h"
				17
				18	#include <algorithm>
				19	#include <cassert>
				20	#include <cctype>
				21	#include <cstdlib>
				22	#include <utility>
				23
				24	using namespace llvm;
				25
				26	using Kind = RCToken::Kind;
				27
				28	// Checks if Representation is a correct description of an RC integer.
				29	// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
				30	// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
				31	// character (that is the difference between our representation and
				32	// StringRef's one). If Representation is correct, 'true' is returned and
				33	// the return value is put back in Num.
				34	static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
				35	size_t Length = Representation.size();
				36	if (Length == 0)
				37	return false;
				38	// Strip the last 'L' if unnecessary.
				39	if (std::toupper(Representation.back()) == 'L')
				40	Representation = Representation.drop_back(1);
				41
				42	return !Representation.getAsInteger<uint32_t>(0, Num);
				43	}
				44
				45	RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
				46	: TokenKind(RCTokenKind), TokenValue(Value) {}
				47
				48	uint32_t RCToken::intValue() const {
				49	assert(TokenKind == Kind::Int);
				50	// We assume that the token already is a correct integer (checked by
				51	// rcGetAsInteger).
				52	uint32_t Result;
				53	bool IsSuccess = rcGetAsInteger(TokenValue, Result);
				54	assert(IsSuccess);
				55	(void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
				56	return Result;
				57	}
				58
Zachary Turner	07bc04f	2017-10-06 21:26:06 +0000	[diff] [blame^]	59	bool RCToken::isLongInt() const {
				60	return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
				61	}
				62
Marek Sokolowski	719e22d	2017-08-10 16:21:44 +0000	[diff] [blame]	63	StringRef RCToken::value() const { return TokenValue; }
				64
				65	Kind RCToken::kind() const { return TokenKind; }
				66
Marek Sokolowski	7e89ee7	2017-09-28 23:53:25 +0000	[diff] [blame]	67	bool RCToken::isBinaryOp() const {
				68	switch (TokenKind) {
				69	case Kind::Plus:
				70	case Kind::Minus:
				71	case Kind::Pipe:
				72	case Kind::Amp:
				73	return true;
				74	default:
				75	return false;
				76	}
				77	}
				78
Marek Sokolowski	719e22d	2017-08-10 16:21:44 +0000	[diff] [blame]	79	static Error getStringError(const Twine &message) {
				80	return make_error<StringError>("Error parsing file: " + message,
				81	inconvertibleErrorCode());
				82	}
				83
				84	namespace {
				85
				86	class Tokenizer {
				87	public:
				88	Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}
				89
				90	Expected<std::vector<RCToken>> run();
				91
				92	private:
				93	// All 'advancing' methods return boolean values; if they're equal to false,
				94	// the stream has ended or failed.
				95	bool advance(size_t Amount = 1);
				96	bool skipWhitespaces();
				97
				98	// Consumes a token. If any problem occurred, a non-empty Error is returned.
				99	Error consumeToken(const Kind TokenKind);
				100
				101	// Check if tokenizer is about to read FollowingChars.
				102	bool willNowRead(StringRef FollowingChars) const;
				103
				104	// Check if tokenizer can start reading an identifier at current position.
				105	// The original tool did non specify the rules to determine what is a correct
				106	// identifier. We assume they should follow the C convention:
Benjamin Kramer	b04d84c	2017-09-07 09:54:03 +0000	[diff] [blame]	107	// [a-zA-Z_][a-zA-Z0-9_]*.
Marek Sokolowski	719e22d	2017-08-10 16:21:44 +0000	[diff] [blame]	108	bool canStartIdentifier() const;
				109	// Check if tokenizer can continue reading an identifier.
				110	bool canContinueIdentifier() const;
				111
				112	// Check if tokenizer can start reading an integer.
				113	// A correct integer always starts with a 0-9 digit,
				114	// can contain characters 0-9A-Fa-f (digits),
				115	// Ll (marking the integer is 32-bit), Xx (marking the representation
				116	// is hexadecimal). As some kind of separator should come after the
				117	// integer, we can consume the integer until a non-alphanumeric
				118	// character.
				119	bool canStartInt() const;
				120	bool canContinueInt() const;
				121
				122	bool canStartString() const;
				123
				124	bool streamEof() const;
				125
				126	// Classify the token that is about to be read from the current position.
				127	Kind classifyCurrentToken() const;
				128
				129	// Process the Kind::Identifier token - check if it is
				130	// an identifier describing a block start or end.
				131	void processIdentifier(RCToken &token) const;
				132
				133	StringRef Data;
				134	size_t DataLength, Pos;
				135	};
				136
				137	Expected<std::vector<RCToken>> Tokenizer::run() {
				138	Pos = 0;
				139	std::vector<RCToken> Result;
				140
				141	// Consume an optional UTF-8 Byte Order Mark.
				142	if (willNowRead("\xef\xbb\xbf"))
				143	advance(3);
				144
				145	while (!streamEof()) {
				146	if (!skipWhitespaces())
				147	break;
				148
				149	Kind TokenKind = classifyCurrentToken();
				150	if (TokenKind == Kind::Invalid)
				151	return getStringError("Invalid token found at position " + Twine(Pos));
				152
				153	const size_t TokenStart = Pos;
				154	if (Error TokenError = consumeToken(TokenKind))
				155	return std::move(TokenError);
				156
				157	RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
				158	if (TokenKind == Kind::Identifier) {
				159	processIdentifier(Token);
				160	} else if (TokenKind == Kind::Int) {
				161	uint32_t TokenInt;
				162	if (!rcGetAsInteger(Token.value(), TokenInt)) {
				163	// The integer has incorrect format or cannot be represented in
				164	// a 32-bit integer.
				165	return getStringError("Integer invalid or too large: " +
				166	Token.value().str());
				167	}
				168	}
				169
				170	Result.push_back(Token);
				171	}
				172
				173	return Result;
				174	}
				175
				176	bool Tokenizer::advance(size_t Amount) {
				177	Pos += Amount;
				178	return !streamEof();
				179	}
				180
				181	bool Tokenizer::skipWhitespaces() {
				182	while (!streamEof() && std::isspace(Data[Pos]))
				183	advance();
				184	return !streamEof();
				185	}
				186
				187	Error Tokenizer::consumeToken(const Kind TokenKind) {
				188	switch (TokenKind) {
				189	// One-character token consumption.
				190	#define TOKEN(Name)
				191	#define SHORT_TOKEN(Name, Ch) case Kind::Name:
				192	#include "ResourceScriptTokenList.h"
				193	#undef TOKEN
				194	#undef SHORT_TOKEN
				195	advance();
				196	return Error::success();
				197
				198	case Kind::Identifier:
				199	while (!streamEof() && canContinueIdentifier())
				200	advance();
				201	return Error::success();
				202
				203	case Kind::Int:
				204	while (!streamEof() && canContinueInt())
				205	advance();
				206	return Error::success();
				207
				208	case Kind::String:
				209	// Consume the preceding 'L', if there is any.
				210	if (std::toupper(Data[Pos]) == 'L')
				211	advance();
				212	// Consume the double-quote.
				213	advance();
				214
				215	// Consume the characters until the end of the file, line or string.
				216	while (true) {
				217	if (streamEof()) {
				218	return getStringError("Unterminated string literal.");
				219	} else if (Data[Pos] == '"') {
				220	// Consume the ending double-quote.
				221	advance();
				222	return Error::success();
				223	} else if (Data[Pos] == '\n') {
				224	return getStringError("String literal not terminated in the line.");
				225	}
				226
				227	advance();
				228	}
				229
				230	case Kind::Invalid:
				231	assert(false && "Cannot consume an invalid token.");
				232	}
Marek Sokolowski	d0c5bfa	2017-08-10 16:46:52 +0000	[diff] [blame]	233
Simon Pilgrim	c3e546f	2017-08-10 17:20:09 +0000	[diff] [blame]	234	llvm_unreachable("Unknown RCToken::Kind");
Marek Sokolowski	719e22d	2017-08-10 16:21:44 +0000	[diff] [blame]	235	}
				236
				237	bool Tokenizer::willNowRead(StringRef FollowingChars) const {
				238	return Data.drop_front(Pos).startswith(FollowingChars);
				239	}
				240
				241	bool Tokenizer::canStartIdentifier() const {
				242	assert(!streamEof());
				243
				244	const char CurChar = Data[Pos];
				245	return std::isalpha(CurChar) \|\| CurChar == '_';
				246	}
				247
				248	bool Tokenizer::canContinueIdentifier() const {
				249	assert(!streamEof());
				250	const char CurChar = Data[Pos];
				251	return std::isalnum(CurChar) \|\| CurChar == '_';
				252	}
				253
				254	bool Tokenizer::canStartInt() const {
				255	assert(!streamEof());
				256	return std::isdigit(Data[Pos]);
				257	}
				258
				259	bool Tokenizer::canContinueInt() const {
				260	assert(!streamEof());
				261	return std::isalnum(Data[Pos]);
				262	}
				263
				264	bool Tokenizer::canStartString() const {
				265	return willNowRead("\"") \|\| willNowRead("L\"") \|\| willNowRead("l\"");
				266	}
				267
				268	bool Tokenizer::streamEof() const { return Pos == DataLength; }
				269
				270	Kind Tokenizer::classifyCurrentToken() const {
				271	if (canStartInt())
				272	return Kind::Int;
				273	if (canStartString())
				274	return Kind::String;
				275	// BEGIN and END are at this point of lexing recognized as identifiers.
				276	if (canStartIdentifier())
				277	return Kind::Identifier;
				278
				279	const char CurChar = Data[Pos];
				280
				281	switch (CurChar) {
				282	// One-character token classification.
				283	#define TOKEN(Name)
				284	#define SHORT_TOKEN(Name, Ch) \
				285	case Ch: \
				286	return Kind::Name;
				287	#include "ResourceScriptTokenList.h"
				288	#undef TOKEN
				289	#undef SHORT_TOKEN
				290
				291	default:
				292	return Kind::Invalid;
				293	}
				294	}
				295
				296	void Tokenizer::processIdentifier(RCToken &Token) const {
				297	assert(Token.kind() == Kind::Identifier);
				298	StringRef Name = Token.value();
				299
				300	if (Name.equals_lower("begin"))
				301	Token = RCToken(Kind::BlockBegin, Name);
				302	else if (Name.equals_lower("end"))
				303	Token = RCToken(Kind::BlockEnd, Name);
				304	}
				305
				306	} // anonymous namespace
				307
				308	namespace llvm {
				309
				310	Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
				311	return Tokenizer(Input).run();
				312	}
				313
				314	} // namespace llvm