blob: 36027d14ba06b92118c21585e799cd3b47304562 [file] [log] [blame]
Marek Sokolowski719e22d2017-08-10 16:21:44 +00001//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===---------------------------------------------------------------------===//
9//
10// This file implements an interface defined in ResourceScriptToken.h.
11// In particular, it defines an .rc script tokenizer.
12//
13//===---------------------------------------------------------------------===//
14
15#include "ResourceScriptToken.h"
16#include "llvm/Support/raw_ostream.h"
17
18#include <algorithm>
19#include <cassert>
20#include <cctype>
21#include <cstdlib>
22#include <utility>
23
24using namespace llvm;
25
26using Kind = RCToken::Kind;
27
28// Checks if Representation is a correct description of an RC integer.
29// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
30// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
31// character (that is the difference between our representation and
32// StringRef's one). If Representation is correct, 'true' is returned and
33// the return value is put back in Num.
34static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
35 size_t Length = Representation.size();
36 if (Length == 0)
37 return false;
38 // Strip the last 'L' if unnecessary.
39 if (std::toupper(Representation.back()) == 'L')
40 Representation = Representation.drop_back(1);
41
42 return !Representation.getAsInteger<uint32_t>(0, Num);
43}
44
45RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
46 : TokenKind(RCTokenKind), TokenValue(Value) {}
47
48uint32_t RCToken::intValue() const {
49 assert(TokenKind == Kind::Int);
50 // We assume that the token already is a correct integer (checked by
51 // rcGetAsInteger).
52 uint32_t Result;
53 bool IsSuccess = rcGetAsInteger(TokenValue, Result);
54 assert(IsSuccess);
55 (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
56 return Result;
57}
58
Zachary Turner07bc04f2017-10-06 21:26:06 +000059bool RCToken::isLongInt() const {
60 return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
61}
62
Marek Sokolowski719e22d2017-08-10 16:21:44 +000063StringRef RCToken::value() const { return TokenValue; }
64
65Kind RCToken::kind() const { return TokenKind; }
66
Marek Sokolowski7e89ee72017-09-28 23:53:25 +000067bool RCToken::isBinaryOp() const {
68 switch (TokenKind) {
69 case Kind::Plus:
70 case Kind::Minus:
71 case Kind::Pipe:
72 case Kind::Amp:
73 return true;
74 default:
75 return false;
76 }
77}
78
Marek Sokolowski719e22d2017-08-10 16:21:44 +000079static Error getStringError(const Twine &message) {
80 return make_error<StringError>("Error parsing file: " + message,
81 inconvertibleErrorCode());
82}
83
84namespace {
85
86class Tokenizer {
87public:
88 Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}
89
90 Expected<std::vector<RCToken>> run();
91
92private:
93 // All 'advancing' methods return boolean values; if they're equal to false,
94 // the stream has ended or failed.
95 bool advance(size_t Amount = 1);
96 bool skipWhitespaces();
97
98 // Consumes a token. If any problem occurred, a non-empty Error is returned.
99 Error consumeToken(const Kind TokenKind);
100
101 // Check if tokenizer is about to read FollowingChars.
102 bool willNowRead(StringRef FollowingChars) const;
103
104 // Check if tokenizer can start reading an identifier at current position.
105 // The original tool did non specify the rules to determine what is a correct
106 // identifier. We assume they should follow the C convention:
Benjamin Kramerb04d84c2017-09-07 09:54:03 +0000107 // [a-zA-Z_][a-zA-Z0-9_]*.
Marek Sokolowski719e22d2017-08-10 16:21:44 +0000108 bool canStartIdentifier() const;
109 // Check if tokenizer can continue reading an identifier.
110 bool canContinueIdentifier() const;
111
112 // Check if tokenizer can start reading an integer.
113 // A correct integer always starts with a 0-9 digit,
114 // can contain characters 0-9A-Fa-f (digits),
115 // Ll (marking the integer is 32-bit), Xx (marking the representation
116 // is hexadecimal). As some kind of separator should come after the
117 // integer, we can consume the integer until a non-alphanumeric
118 // character.
119 bool canStartInt() const;
120 bool canContinueInt() const;
121
122 bool canStartString() const;
123
124 bool streamEof() const;
125
126 // Classify the token that is about to be read from the current position.
127 Kind classifyCurrentToken() const;
128
129 // Process the Kind::Identifier token - check if it is
130 // an identifier describing a block start or end.
131 void processIdentifier(RCToken &token) const;
132
133 StringRef Data;
134 size_t DataLength, Pos;
135};
136
137Expected<std::vector<RCToken>> Tokenizer::run() {
138 Pos = 0;
139 std::vector<RCToken> Result;
140
141 // Consume an optional UTF-8 Byte Order Mark.
142 if (willNowRead("\xef\xbb\xbf"))
143 advance(3);
144
145 while (!streamEof()) {
146 if (!skipWhitespaces())
147 break;
148
149 Kind TokenKind = classifyCurrentToken();
150 if (TokenKind == Kind::Invalid)
151 return getStringError("Invalid token found at position " + Twine(Pos));
152
153 const size_t TokenStart = Pos;
154 if (Error TokenError = consumeToken(TokenKind))
155 return std::move(TokenError);
156
157 RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
158 if (TokenKind == Kind::Identifier) {
159 processIdentifier(Token);
160 } else if (TokenKind == Kind::Int) {
161 uint32_t TokenInt;
162 if (!rcGetAsInteger(Token.value(), TokenInt)) {
163 // The integer has incorrect format or cannot be represented in
164 // a 32-bit integer.
165 return getStringError("Integer invalid or too large: " +
166 Token.value().str());
167 }
168 }
169
170 Result.push_back(Token);
171 }
172
173 return Result;
174}
175
176bool Tokenizer::advance(size_t Amount) {
177 Pos += Amount;
178 return !streamEof();
179}
180
181bool Tokenizer::skipWhitespaces() {
182 while (!streamEof() && std::isspace(Data[Pos]))
183 advance();
184 return !streamEof();
185}
186
187Error Tokenizer::consumeToken(const Kind TokenKind) {
188 switch (TokenKind) {
189 // One-character token consumption.
190#define TOKEN(Name)
191#define SHORT_TOKEN(Name, Ch) case Kind::Name:
192#include "ResourceScriptTokenList.h"
193#undef TOKEN
194#undef SHORT_TOKEN
195 advance();
196 return Error::success();
197
198 case Kind::Identifier:
199 while (!streamEof() && canContinueIdentifier())
200 advance();
201 return Error::success();
202
203 case Kind::Int:
204 while (!streamEof() && canContinueInt())
205 advance();
206 return Error::success();
207
208 case Kind::String:
209 // Consume the preceding 'L', if there is any.
210 if (std::toupper(Data[Pos]) == 'L')
211 advance();
212 // Consume the double-quote.
213 advance();
214
215 // Consume the characters until the end of the file, line or string.
216 while (true) {
217 if (streamEof()) {
218 return getStringError("Unterminated string literal.");
219 } else if (Data[Pos] == '"') {
220 // Consume the ending double-quote.
221 advance();
222 return Error::success();
223 } else if (Data[Pos] == '\n') {
224 return getStringError("String literal not terminated in the line.");
225 }
226
227 advance();
228 }
229
230 case Kind::Invalid:
231 assert(false && "Cannot consume an invalid token.");
232 }
Marek Sokolowskid0c5bfa2017-08-10 16:46:52 +0000233
Simon Pilgrimc3e546f2017-08-10 17:20:09 +0000234 llvm_unreachable("Unknown RCToken::Kind");
Marek Sokolowski719e22d2017-08-10 16:21:44 +0000235}
236
237bool Tokenizer::willNowRead(StringRef FollowingChars) const {
238 return Data.drop_front(Pos).startswith(FollowingChars);
239}
240
241bool Tokenizer::canStartIdentifier() const {
242 assert(!streamEof());
243
244 const char CurChar = Data[Pos];
245 return std::isalpha(CurChar) || CurChar == '_';
246}
247
248bool Tokenizer::canContinueIdentifier() const {
249 assert(!streamEof());
250 const char CurChar = Data[Pos];
251 return std::isalnum(CurChar) || CurChar == '_';
252}
253
254bool Tokenizer::canStartInt() const {
255 assert(!streamEof());
256 return std::isdigit(Data[Pos]);
257}
258
259bool Tokenizer::canContinueInt() const {
260 assert(!streamEof());
261 return std::isalnum(Data[Pos]);
262}
263
264bool Tokenizer::canStartString() const {
265 return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
266}
267
268bool Tokenizer::streamEof() const { return Pos == DataLength; }
269
270Kind Tokenizer::classifyCurrentToken() const {
271 if (canStartInt())
272 return Kind::Int;
273 if (canStartString())
274 return Kind::String;
275 // BEGIN and END are at this point of lexing recognized as identifiers.
276 if (canStartIdentifier())
277 return Kind::Identifier;
278
279 const char CurChar = Data[Pos];
280
281 switch (CurChar) {
282 // One-character token classification.
283#define TOKEN(Name)
284#define SHORT_TOKEN(Name, Ch) \
285 case Ch: \
286 return Kind::Name;
287#include "ResourceScriptTokenList.h"
288#undef TOKEN
289#undef SHORT_TOKEN
290
291 default:
292 return Kind::Invalid;
293 }
294}
295
296void Tokenizer::processIdentifier(RCToken &Token) const {
297 assert(Token.kind() == Kind::Identifier);
298 StringRef Name = Token.value();
299
300 if (Name.equals_lower("begin"))
301 Token = RCToken(Kind::BlockBegin, Name);
302 else if (Name.equals_lower("end"))
303 Token = RCToken(Kind::BlockEnd, Name);
304}
305
306} // anonymous namespace
307
308namespace llvm {
309
310Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
311 return Tokenizer(Input).run();
312}
313
314} // namespace llvm