//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//

#include "ResourceScriptToken.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <utility>

using namespace llvm;

using Kind = RCToken::Kind;

// Checks if Representation is a correct description of an RC integer.
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
// character (that is the difference between our representation and
// StringRef's). If Representation is correct, 'true' is returned and
// the parsed value is stored in Num.
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  size_t Length = Representation.size();
  if (Length == 0)
    return false;
  // Strip the trailing 'L', if present.
  if (std::toupper(Representation.back()) == 'L')
    Representation = Representation.drop_back(1);

  return !Representation.getAsInteger<uint32_t>(0, Num);
}
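
// For illustration, a sketch of the rules above (not exhaustive):
//   uint32_t N;
//   rcGetAsInteger("0x2AL", N); // true; trailing 'L' dropped, N == 42.
//   rcGetAsInteger("017", N);   // true; octal, N == 15.
//   rcGetAsInteger("08", N);    // false; '8' is not a valid octal digit.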

RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}

uint32_t RCToken::intValue() const {
  assert(TokenKind == Kind::Int);
  // We assume that the token already is a correct integer (checked by
  // rcGetAsInteger).
  uint32_t Result;
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  assert(IsSuccess);
  (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
  return Result;
}

StringRef RCToken::value() const { return TokenValue; }

Kind RCToken::kind() const { return TokenKind; }

static Error getStringError(const Twine &message) {
  return make_error<StringError>("Error parsing file: " + message,
                                 inconvertibleErrorCode());
}

namespace {

class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return boolean values; if they return false,
  // the stream has ended or failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules to determine what is a correct
  // identifier. We assume they should follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit,
  // can contain characters 0-9A-Fa-f (digits),
  // Ll (marking the integer is 32-bit), Xx (marking the representation
  // is hexadecimal). As some kind of separator should come after the
  // integer, we can consume the integer until a non-alphanumeric
  // character.
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &token) const;

  StringRef Data;
  size_t DataLength, Pos;
};

Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Result;

  // Consume an optional UTF-8 Byte Order Mark.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  while (!streamEof()) {
    if (!skipWhitespaces())
      break;

    Kind TokenKind = classifyCurrentToken();
    if (TokenKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t TokenStart = Pos;
    if (Error TokenError = consumeToken(TokenKind))
      return std::move(TokenError);

    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    if (TokenKind == Kind::Identifier) {
      processIdentifier(Token);
    } else if (TokenKind == Kind::Int) {
      uint32_t TokenInt;
      if (!rcGetAsInteger(Token.value(), TokenInt)) {
        // The integer has incorrect format or cannot be represented in
        // a 32-bit integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
    }

    Result.push_back(Token);
  }

  return Result;
}
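
// For illustration, a short fragment such as
//   1 BEGIN "hello" END
// is expected to tokenize to Int("1"), BlockBegin("BEGIN"),
// String("\"hello\"") and BlockEnd("END"); note that a string token keeps
// its surrounding quotes (and any leading 'L') in its value.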

bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}

bool Tokenizer::skipWhitespaces() {
  while (!streamEof() && std::isspace(Data[Pos]))
    advance();
  return !streamEof();
}

Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN
    advance();
    return Error::success();

  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }

  // This silences compilers that cannot notice that execution
  // never reaches this point.
  assert(false);
}
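
// Note on the pattern above: ResourceScriptTokenList.h is included as an
// X-macro list, so redefining SHORT_TOKEN before the #include expands every
// single-character token into a case label. For example, a hypothetical entry
//   SHORT_TOKEN(Comma, ',')
// would expand to "case Kind::Comma:" here and to
// "case ',': return Kind::Comma;" in classifyCurrentToken below.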

bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  return Data.drop_front(Pos).startswith(FollowingChars);
}

bool Tokenizer::canStartIdentifier() const {
  assert(!streamEof());

  const char CurChar = Data[Pos];
  return std::isalpha(CurChar) || CurChar == '_';
}

bool Tokenizer::canContinueIdentifier() const {
  assert(!streamEof());
  const char CurChar = Data[Pos];
  return std::isalnum(CurChar) || CurChar == '_';
}

bool Tokenizer::canStartInt() const {
  assert(!streamEof());
  return std::isdigit(Data[Pos]);
}

bool Tokenizer::canContinueInt() const {
  assert(!streamEof());
  return std::isalnum(Data[Pos]);
}
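
// Note that canContinueInt accepts any alphanumeric character, so a malformed
// run such as "0abc" is consumed as a single Int token here and only rejected
// later, when run() re-validates it with rcGetAsInteger.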

bool Tokenizer::canStartString() const {
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
}

bool Tokenizer::streamEof() const { return Pos == DataLength; }

Kind Tokenizer::classifyCurrentToken() const {
  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // At this point of lexing, BEGIN and END are still recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) \
  case Ch: \
    return Kind::Name;
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN

  default:
    return Kind::Invalid;
  }
}

void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  StringRef Name = Token.value();

  if (Name.equals_lower("begin"))
    Token = RCToken(Kind::BlockBegin, Name);
  else if (Name.equals_lower("end"))
    Token = RCToken(Kind::BlockEnd, Name);
}
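
// As a consequence, "BEGIN", "begin" and "Begin" all yield a BlockBegin token
// (and likewise for END/BlockEnd), while the token's value keeps the original
// spelling.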

} // anonymous namespace

namespace llvm {

Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  return Tokenizer(Input).run();
}
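
// A minimal caller sketch (assuming the usual llvm::Expected handling):
//   Expected<std::vector<RCToken>> Tokens = tokenizeRC(Contents);
//   if (!Tokens)
//     return Tokens.takeError();
//   for (const RCToken &Token : *Tokens) {
//     // Inspect Token.kind() and Token.value() here.
//   }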

} // namespace llvm