//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//

#include "ResourceScriptToken.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <utility>

using namespace llvm;

using Kind = RCToken::Kind;

// Checks if Representation is a correct description of an RC integer.
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
// character (that is the difference between our representation and
// StringRef's). If Representation is correct, 'true' is returned and
// the parsed value is stored in Num.
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  size_t Length = Representation.size();
  if (Length == 0)
    return false;
  // Strip the trailing 'L', if present.
  if (std::toupper(Representation.back()) == 'L')
    Representation = Representation.drop_back(1);

  return !Representation.getAsInteger<uint32_t>(0, Num);
}
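
// For illustration, a sketch of the rules above (not exhaustive):
//   uint32_t N;
//   rcGetAsInteger("0x2AL", N); // true; trailing 'L' dropped, N == 42.
//   rcGetAsInteger("017", N);   // true; octal, N == 15.
//   rcGetAsInteger("08", N);    // false; '8' is not a valid octal digit.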

RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}

uint32_t RCToken::intValue() const {
  assert(TokenKind == Kind::Int);
  // We assume that the token already is a correct integer (checked by
  // rcGetAsInteger).
  uint32_t Result;
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  assert(IsSuccess);
  (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
  return Result;
}

StringRef RCToken::value() const { return TokenValue; }

Kind RCToken::kind() const { return TokenKind; }

static Error getStringError(const Twine &message) {
  return make_error<StringError>("Error parsing file: " + message,
                                 inconvertibleErrorCode());
}

namespace {

class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return boolean values; if they return false,
  // the stream has ended or failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules to determine what is a correct
  // identifier. We assume they should follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit,
  // can contain characters 0-9A-Fa-f (digits),
  // Ll (marking the integer is 32-bit), Xx (marking the representation
  // is hexadecimal). As some kind of separator should come after the
  // integer, we can consume the integer until a non-alphanumeric
  // character.
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &token) const;

  StringRef Data;
  size_t DataLength, Pos;
};

Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Result;

  // Consume an optional UTF-8 Byte Order Mark.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  while (!streamEof()) {
    if (!skipWhitespaces())
      break;

    Kind TokenKind = classifyCurrentToken();
    if (TokenKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t TokenStart = Pos;
    if (Error TokenError = consumeToken(TokenKind))
      return std::move(TokenError);

    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    if (TokenKind == Kind::Identifier) {
      processIdentifier(Token);
    } else if (TokenKind == Kind::Int) {
      uint32_t TokenInt;
      if (!rcGetAsInteger(Token.value(), TokenInt)) {
        // The integer has incorrect format or cannot be represented in
        // a 32-bit integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
    }

    Result.push_back(Token);
  }

  return Result;
}
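
// For illustration, a short fragment such as
//   1 BEGIN "hello" END
// is expected to tokenize to Int("1"), BlockBegin("BEGIN"),
// String("\"hello\"") and BlockEnd("END"); note that a string token keeps
// its surrounding quotes (and any leading 'L') in its value.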

bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}

bool Tokenizer::skipWhitespaces() {
  while (!streamEof() && std::isspace(Data[Pos]))
    advance();
  return !streamEof();
}

Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN
    advance();
    return Error::success();

  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }

  // This silences compilers that cannot notice that execution
  // never reaches this point.
  assert(false);
}
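
// Note on the pattern above: ResourceScriptTokenList.h is included as an
// X-macro list, so redefining SHORT_TOKEN before the #include expands every
// single-character token into a case label. For example, a hypothetical entry
//   SHORT_TOKEN(Comma, ',')
// would expand to "case Kind::Comma:" here and to
// "case ',': return Kind::Comma;" in classifyCurrentToken below.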

bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  return Data.drop_front(Pos).startswith(FollowingChars);
}

bool Tokenizer::canStartIdentifier() const {
  assert(!streamEof());

  const char CurChar = Data[Pos];
  return std::isalpha(CurChar) || CurChar == '_';
}

bool Tokenizer::canContinueIdentifier() const {
  assert(!streamEof());
  const char CurChar = Data[Pos];
  return std::isalnum(CurChar) || CurChar == '_';
}

bool Tokenizer::canStartInt() const {
  assert(!streamEof());
  return std::isdigit(Data[Pos]);
}

bool Tokenizer::canContinueInt() const {
  assert(!streamEof());
  return std::isalnum(Data[Pos]);
}
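
// Note that canContinueInt accepts any alphanumeric character, so a malformed
// run such as "0abc" is consumed as a single Int token here and only rejected
// later, when run() re-validates it with rcGetAsInteger.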

bool Tokenizer::canStartString() const {
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
}

bool Tokenizer::streamEof() const { return Pos == DataLength; }

Kind Tokenizer::classifyCurrentToken() const {
  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // At this point of lexing, BEGIN and END are still recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) \
  case Ch: \
    return Kind::Name;
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN

  default:
    return Kind::Invalid;
  }
}

void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  StringRef Name = Token.value();

  if (Name.equals_lower("begin"))
    Token = RCToken(Kind::BlockBegin, Name);
  else if (Name.equals_lower("end"))
    Token = RCToken(Kind::BlockEnd, Name);
}
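
// As a consequence, "BEGIN", "begin" and "Begin" all yield a BlockBegin token
// (and likewise for END/BlockEnd), while the token's value keeps the original
// spelling.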

} // anonymous namespace

namespace llvm {

Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  return Tokenizer(Input).run();
}
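
// A minimal caller sketch (assuming the usual llvm::Expected handling):
//   Expected<std::vector<RCToken>> Tokens = tokenizeRC(Contents);
//   if (!Tokens)
//     return Tokens.takeError();
//   for (const RCToken &Token : *Tokens) {
//     // Inspect Token.kind() and Token.value() here.
//   }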

} // namespace llvm