blob: fc09a30ddd9928ae3b4bb7b9bc6f33ab3b1bae2f [file] [log] [blame]
//===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Recursive parser implementation for the matcher expression grammar.
///
//===----------------------------------------------------------------------===//
14
#include <cassert>
#include <string>
#include <vector>

#include "clang/ASTMatchers/Dynamic/Parser.h"
#include "clang/ASTMatchers/Dynamic/Registry.h"
#include "clang/Basic/CharInfo.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"
22
23namespace clang {
24namespace ast_matchers {
25namespace dynamic {
26
27/// \brief Simple structure to hold information for one token from the parser.
28struct Parser::TokenInfo {
29 /// \brief Different possible tokens.
30 enum TokenKind {
31 TK_Eof = 0,
32 TK_OpenParen = 1,
33 TK_CloseParen = 2,
34 TK_Comma = 3,
Samuel Benzaquen4f37d922013-06-03 19:31:08 +000035 TK_Period = 4,
36 TK_Literal = 5,
37 TK_Ident = 6,
38 TK_InvalidChar = 7,
39 TK_Error = 8
Manuel Klimekf7f295f2013-05-14 09:13:00 +000040 };
41
Samuel Benzaquen4f37d922013-06-03 19:31:08 +000042 /// \brief Some known identifiers.
43 static const char* const ID_Bind;
44
Manuel Klimekf7f295f2013-05-14 09:13:00 +000045 TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
46
47 StringRef Text;
48 TokenKind Kind;
49 SourceRange Range;
50 VariantValue Value;
51};
52
Samuel Benzaquen4f37d922013-06-03 19:31:08 +000053const char* const Parser::TokenInfo::ID_Bind = "bind";
54
Manuel Klimekf7f295f2013-05-14 09:13:00 +000055/// \brief Simple tokenizer for the parser.
56class Parser::CodeTokenizer {
57public:
58 explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
59 : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) {
60 NextToken = getNextToken();
61 }
62
63 /// \brief Returns but doesn't consume the next token.
64 const TokenInfo &peekNextToken() const { return NextToken; }
65
66 /// \brief Consumes and returns the next token.
67 TokenInfo consumeNextToken() {
68 TokenInfo ThisToken = NextToken;
69 NextToken = getNextToken();
70 return ThisToken;
71 }
72
73 TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
74
75private:
76 TokenInfo getNextToken() {
77 consumeWhitespace();
78 TokenInfo Result;
79 Result.Range.Start = currentLocation();
80
81 if (Code.empty()) {
82 Result.Kind = TokenInfo::TK_Eof;
83 Result.Text = "";
84 return Result;
85 }
86
87 switch (Code[0]) {
88 case ',':
89 Result.Kind = TokenInfo::TK_Comma;
90 Result.Text = Code.substr(0, 1);
91 Code = Code.drop_front();
92 break;
Samuel Benzaquen4f37d922013-06-03 19:31:08 +000093 case '.':
94 Result.Kind = TokenInfo::TK_Period;
95 Result.Text = Code.substr(0, 1);
96 Code = Code.drop_front();
97 break;
Manuel Klimekf7f295f2013-05-14 09:13:00 +000098 case '(':
99 Result.Kind = TokenInfo::TK_OpenParen;
100 Result.Text = Code.substr(0, 1);
101 Code = Code.drop_front();
102 break;
103 case ')':
104 Result.Kind = TokenInfo::TK_CloseParen;
105 Result.Text = Code.substr(0, 1);
106 Code = Code.drop_front();
107 break;
108
109 case '"':
110 case '\'':
111 // Parse a string literal.
112 consumeStringLiteral(&Result);
113 break;
114
Samuel Benzaquen7a337af2013-06-04 15:46:22 +0000115 case '0': case '1': case '2': case '3': case '4':
116 case '5': case '6': case '7': case '8': case '9':
117 // Parse an unsigned literal.
118 consumeUnsignedLiteral(&Result);
119 break;
120
Manuel Klimekf7f295f2013-05-14 09:13:00 +0000121 default:
122 if (isAlphanumeric(Code[0])) {
123 // Parse an identifier
124 size_t TokenLength = 1;
125 while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength]))
126 ++TokenLength;
127 Result.Kind = TokenInfo::TK_Ident;
128 Result.Text = Code.substr(0, TokenLength);
129 Code = Code.drop_front(TokenLength);
130 } else {
131 Result.Kind = TokenInfo::TK_InvalidChar;
132 Result.Text = Code.substr(0, 1);
133 Code = Code.drop_front(1);
134 }
135 break;
136 }
137
138 Result.Range.End = currentLocation();
139 return Result;
140 }
141
Samuel Benzaquen7a337af2013-06-04 15:46:22 +0000142 /// \brief Consume an unsigned literal.
143 void consumeUnsignedLiteral(TokenInfo *Result) {
144 unsigned Length = 1;
145 if (Code.size() > 1) {
146 // Consume the 'x' or 'b' radix modifier, if present.
147 switch (toLowercase(Code[1])) {
148 case 'x': case 'b': Length = 2;
149 }
150 }
151 while (Length < Code.size() && isHexDigit(Code[Length]))
152 ++Length;
153
154 Result->Text = Code.substr(0, Length);
155 Code = Code.drop_front(Length);
156
157 unsigned Value;
158 if (!Result->Text.getAsInteger(0, Value)) {
159 Result->Kind = TokenInfo::TK_Literal;
160 Result->Value = Value;
161 } else {
162 SourceRange Range;
163 Range.Start = Result->Range.Start;
164 Range.End = currentLocation();
165 Error->pushErrorFrame(Range, Error->ET_ParserUnsignedError)
166 << Result->Text;
167 Result->Kind = TokenInfo::TK_Error;
168 }
169 }
170
Manuel Klimekf7f295f2013-05-14 09:13:00 +0000171 /// \brief Consume a string literal.
172 ///
173 /// \c Code must be positioned at the start of the literal (the opening
174 /// quote). Consumed until it finds the same closing quote character.
175 void consumeStringLiteral(TokenInfo *Result) {
176 bool InEscape = false;
177 const char Marker = Code[0];
178 for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
179 if (InEscape) {
180 InEscape = false;
181 continue;
182 }
183 if (Code[Length] == '\\') {
184 InEscape = true;
185 continue;
186 }
187 if (Code[Length] == Marker) {
188 Result->Kind = TokenInfo::TK_Literal;
189 Result->Text = Code.substr(0, Length + 1);
190 Result->Value = Code.substr(1, Length - 1).str();
191 Code = Code.drop_front(Length + 1);
192 return;
193 }
194 }
195
196 StringRef ErrorText = Code;
197 Code = Code.drop_front(Code.size());
198 SourceRange Range;
199 Range.Start = Result->Range.Start;
200 Range.End = currentLocation();
201 Error->pushErrorFrame(Range, Error->ET_ParserStringError)
202 << ErrorText;
203 Result->Kind = TokenInfo::TK_Error;
204 }
205
206 /// \brief Consume all leading whitespace from \c Code.
207 void consumeWhitespace() {
208 while (!Code.empty() && isWhitespace(Code[0])) {
209 if (Code[0] == '\n') {
210 ++Line;
211 StartOfLine = Code.drop_front();
212 }
213 Code = Code.drop_front();
214 }
215 }
216
217 SourceLocation currentLocation() {
218 SourceLocation Location;
219 Location.Line = Line;
220 Location.Column = Code.data() - StartOfLine.data() + 1;
221 return Location;
222 }
223
224 StringRef Code;
225 StringRef StartOfLine;
226 unsigned Line;
227 Diagnostics *Error;
228 TokenInfo NextToken;
229};
230
231Parser::Sema::~Sema() {}
232
233/// \brief Parse and validate a matcher expression.
234/// \return \c true on success, in which case \c Value has the matcher parsed.
235/// If the input is malformed, or some argument has an error, it
236/// returns \c false.
237bool Parser::parseMatcherExpressionImpl(VariantValue *Value) {
238 const TokenInfo NameToken = Tokenizer->consumeNextToken();
239 assert(NameToken.Kind == TokenInfo::TK_Ident);
240 const TokenInfo OpenToken = Tokenizer->consumeNextToken();
241 if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
242 Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoOpenParen)
243 << OpenToken.Text;
244 return false;
245 }
246
247 std::vector<ParserValue> Args;
248 TokenInfo EndToken;
249 while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
250 if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
251 // End of args.
252 EndToken = Tokenizer->consumeNextToken();
253 break;
254 }
255 if (Args.size() > 0) {
256 // We must find a , token to continue.
257 const TokenInfo CommaToken = Tokenizer->consumeNextToken();
258 if (CommaToken.Kind != TokenInfo::TK_Comma) {
259 Error->pushErrorFrame(CommaToken.Range, Error->ET_ParserNoComma)
260 << CommaToken.Text;
261 return false;
262 }
263 }
264
265 ParserValue ArgValue;
266 ArgValue.Text = Tokenizer->peekNextToken().Text;
267 ArgValue.Range = Tokenizer->peekNextToken().Range;
268 if (!parseExpressionImpl(&ArgValue.Value)) {
269 Error->pushErrorFrame(NameToken.Range,
270 Error->ET_ParserMatcherArgFailure)
271 << (Args.size() + 1) << NameToken.Text;
272 return false;
273 }
274
275 Args.push_back(ArgValue);
276 }
277
278 if (EndToken.Kind == TokenInfo::TK_Eof) {
279 Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoCloseParen);
280 return false;
281 }
282
Samuel Benzaquen4f37d922013-06-03 19:31:08 +0000283 std::string BindID;
284 if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
285 // Parse .bind("foo")
286 Tokenizer->consumeNextToken(); // consume the period.
287 const TokenInfo BindToken = Tokenizer->consumeNextToken();
288 const TokenInfo OpenToken = Tokenizer->consumeNextToken();
289 const TokenInfo IDToken = Tokenizer->consumeNextToken();
290 const TokenInfo CloseToken = Tokenizer->consumeNextToken();
291
292 // TODO: We could use different error codes for each/some to be more
293 // explicit about the syntax error.
294 if (BindToken.Kind != TokenInfo::TK_Ident ||
295 BindToken.Text != TokenInfo::ID_Bind) {
296 Error->pushErrorFrame(BindToken.Range, Error->ET_ParserMalformedBindExpr);
297 return false;
298 }
299 if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
300 Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
301 return false;
302 }
303 if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
304 Error->pushErrorFrame(IDToken.Range, Error->ET_ParserMalformedBindExpr);
305 return false;
306 }
307 if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
308 Error->pushErrorFrame(CloseToken.Range,
309 Error->ET_ParserMalformedBindExpr);
310 return false;
311 }
312 BindID = IDToken.Value.getString();
313 }
314
Manuel Klimekf7f295f2013-05-14 09:13:00 +0000315 // Merge the start and end infos.
316 SourceRange MatcherRange = NameToken.Range;
317 MatcherRange.End = EndToken.Range.End;
Samuel Benzaquenef7eb022013-06-21 15:51:31 +0000318 MatcherList Result = S->actOnMatcherExpression(
Samuel Benzaquen4f37d922013-06-03 19:31:08 +0000319 NameToken.Text, MatcherRange, BindID, Args, Error);
Samuel Benzaquenef7eb022013-06-21 15:51:31 +0000320 if (Result.empty()) {
Manuel Klimekf7f295f2013-05-14 09:13:00 +0000321 Error->pushErrorFrame(NameToken.Range, Error->ET_ParserMatcherFailure)
322 << NameToken.Text;
323 return false;
324 }
325
Samuel Benzaquenef7eb022013-06-21 15:51:31 +0000326 *Value = Result;
Manuel Klimekf7f295f2013-05-14 09:13:00 +0000327 return true;
328}
329
330/// \brief Parse an <Expresssion>
331bool Parser::parseExpressionImpl(VariantValue *Value) {
332 switch (Tokenizer->nextTokenKind()) {
333 case TokenInfo::TK_Literal:
334 *Value = Tokenizer->consumeNextToken().Value;
335 return true;
336
337 case TokenInfo::TK_Ident:
338 return parseMatcherExpressionImpl(Value);
339
340 case TokenInfo::TK_Eof:
341 Error->pushErrorFrame(Tokenizer->consumeNextToken().Range,
342 Error->ET_ParserNoCode);
343 return false;
344
345 case TokenInfo::TK_Error:
346 // This error was already reported by the tokenizer.
347 return false;
348
349 case TokenInfo::TK_OpenParen:
350 case TokenInfo::TK_CloseParen:
351 case TokenInfo::TK_Comma:
Samuel Benzaquen4f37d922013-06-03 19:31:08 +0000352 case TokenInfo::TK_Period:
Manuel Klimekf7f295f2013-05-14 09:13:00 +0000353 case TokenInfo::TK_InvalidChar:
354 const TokenInfo Token = Tokenizer->consumeNextToken();
355 Error->pushErrorFrame(Token.Range, Error->ET_ParserInvalidToken)
356 << Token.Text;
357 return false;
358 }
359
360 llvm_unreachable("Unknown token kind.");
361}
362
363Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
364 Diagnostics *Error)
365 : Tokenizer(Tokenizer), S(S), Error(Error) {}
366
367class RegistrySema : public Parser::Sema {
368public:
369 virtual ~RegistrySema() {}
Samuel Benzaquenef7eb022013-06-21 15:51:31 +0000370 MatcherList actOnMatcherExpression(StringRef MatcherName,
371 const SourceRange &NameRange,
372 StringRef BindID,
373 ArrayRef<ParserValue> Args,
374 Diagnostics *Error) {
Samuel Benzaquen4f37d922013-06-03 19:31:08 +0000375 if (BindID.empty()) {
376 return Registry::constructMatcher(MatcherName, NameRange, Args, Error);
377 } else {
378 return Registry::constructBoundMatcher(MatcherName, NameRange, BindID,
379 Args, Error);
380 }
Manuel Klimekf7f295f2013-05-14 09:13:00 +0000381 }
382};
383
384bool Parser::parseExpression(StringRef Code, VariantValue *Value,
385 Diagnostics *Error) {
386 RegistrySema S;
387 return parseExpression(Code, &S, Value, Error);
388}
389
390bool Parser::parseExpression(StringRef Code, Sema *S,
391 VariantValue *Value, Diagnostics *Error) {
392 CodeTokenizer Tokenizer(Code, Error);
Samuel Benzaquen4f37d922013-06-03 19:31:08 +0000393 if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
394 if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
395 Error->pushErrorFrame(Tokenizer.peekNextToken().Range,
396 Error->ET_ParserTrailingCode);
397 return false;
398 }
399 return true;
Manuel Klimekf7f295f2013-05-14 09:13:00 +0000400}
401
402DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code,
403 Diagnostics *Error) {
404 RegistrySema S;
405 return parseMatcherExpression(Code, &S, Error);
406}
407
408DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code,
409 Parser::Sema *S,
410 Diagnostics *Error) {
411 VariantValue Value;
412 if (!parseExpression(Code, S, &Value, Error))
413 return NULL;
Samuel Benzaquenef7eb022013-06-21 15:51:31 +0000414 if (!Value.isMatchers()) {
Manuel Klimekf7f295f2013-05-14 09:13:00 +0000415 Error->pushErrorFrame(SourceRange(), Error->ET_ParserNotAMatcher);
416 return NULL;
417 }
Samuel Benzaquenef7eb022013-06-21 15:51:31 +0000418 if (Value.getMatchers().matchers().size() != 1) {
419 Error->pushErrorFrame(SourceRange(), Error->ET_ParserOverloadedType)
420 << Value.getTypeAsString();
421 return NULL;
422 }
423 return Value.getMatchers().matchers()[0]->clone();
Manuel Klimekf7f295f2013-05-14 09:13:00 +0000424}
425
426} // namespace dynamic
427} // namespace ast_matchers
428} // namespace clang