// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
// Based on original Protocol Buffers design by
// Sanjay Ghemawat, Jeff Dean, and others.
//
// Implements parsing of .proto files to FileDescriptorProtos.

#ifndef GOOGLE_PROTOBUF_COMPILER_PARSER_H__
#define GOOGLE_PROTOBUF_COMPILER_PARSER_H__

#include <map>
#include <string>
#include <utility>
#include <google/protobuf/stubs/common.h>
#include <google/protobuf/descriptor.h>
#include <google/protobuf/descriptor.pb.h>
#include <google/protobuf/repeated_field.h>
#include <google/protobuf/io/tokenizer.h>

namespace google {
namespace protobuf { class Message; }

namespace protobuf {
namespace compiler {

// Defined in this file.
class Parser;
class SourceLocationTable;

59// Implements parsing of protocol definitions (such as .proto files).
60//
61// Note that most users will be more interested in the Importer class.
62// Parser is a lower-level class which simply converts a single .proto file
63// to a FileDescriptorProto. It does not resolve import directives or perform
64// many other kinds of validation needed to construct a complete
65// FileDescriptor.
66class LIBPROTOBUF_EXPORT Parser {
67 public:
68 Parser();
69 ~Parser();
70
71 // Parse the entire input and construct a FileDescriptorProto representing
72 // it. Returns true if no errors occurred, false otherwise.
73 bool Parse(io::Tokenizer* input, FileDescriptorProto* file);
74
75 // Optional fetaures:
76
liujisi@google.com33165fe2010-11-02 13:14:58 +000077 // DEPRECATED: New code should use the SourceCodeInfo embedded in the
78 // FileDescriptorProto.
79 //
temporal40ee5512008-07-10 02:12:20 +000080 // Requests that locations of certain definitions be recorded to the given
81 // SourceLocationTable while parsing. This can be used to look up exact line
82 // and column numbers for errors reported by DescriptorPool during validation.
83 // Set to NULL (the default) to discard source location information.
84 void RecordSourceLocationsTo(SourceLocationTable* location_table) {
85 source_location_table_ = location_table;
86 }
87
liujisi@google.com33165fe2010-11-02 13:14:58 +000088 // Requests that errors be recorded to the given ErrorCollector while
temporal40ee5512008-07-10 02:12:20 +000089 // parsing. Set to NULL (the default) to discard error messages.
90 void RecordErrorsTo(io::ErrorCollector* error_collector) {
91 error_collector_ = error_collector;
92 }
93
94 // Returns the identifier used in the "syntax = " declaration, if one was
95 // seen during the last call to Parse(), or the empty string otherwise.
kenton@google.comd37d46d2009-04-25 02:53:47 +000096 const string& GetSyntaxIdentifier() { return syntax_identifier_; }
temporal40ee5512008-07-10 02:12:20 +000097
98 // If set true, input files will be required to begin with a syntax
99 // identifier. Otherwise, files may omit this. If a syntax identifier
100 // is provided, it must be 'syntax = "proto2";' and must appear at the
101 // top of this file regardless of whether or not it was required.
102 void SetRequireSyntaxIdentifier(bool value) {
103 require_syntax_identifier_ = value;
104 }
105
kenton@google.comd37d46d2009-04-25 02:53:47 +0000106 // Call SetStopAfterSyntaxIdentifier(true) to tell the parser to stop
107 // parsing as soon as it has seen the syntax identifier, or lack thereof.
108 // This is useful for quickly identifying the syntax of the file without
109 // parsing the whole thing. If this is enabled, no error will be recorded
110 // if the syntax identifier is something other than "proto2" (since
111 // presumably the caller intends to deal with that), but other kinds of
112 // errors (e.g. parse errors) will still be reported. When this is enabled,
113 // you may pass a NULL FileDescriptorProto to Parse().
114 void SetStopAfterSyntaxIdentifier(bool value) {
115 stop_after_syntax_identifier_ = value;
116 }
117
temporal40ee5512008-07-10 02:12:20 +0000118 private:
119 // =================================================================
120 // Error recovery helpers
121
122 // Consume the rest of the current statement. This consumes tokens
123 // until it sees one of:
124 // ';' Consumes the token and returns.
125 // '{' Consumes the brace then calls SkipRestOfBlock().
126 // '}' Returns without consuming.
127 // EOF Returns (can't consume).
128 // The Parser often calls SkipStatement() after encountering a syntax
129 // error. This allows it to go on parsing the following lines, allowing
130 // it to report more than just one error in the file.
131 void SkipStatement();
132
133 // Consume the rest of the current block, including nested blocks,
134 // ending after the closing '}' is encountered and consumed, or at EOF.
135 void SkipRestOfBlock();
136
137 // -----------------------------------------------------------------
138 // Single-token consuming helpers
139 //
140 // These make parsing code more readable.
141
142 // True if the current token is TYPE_END.
143 inline bool AtEnd();
144
145 // True if the next token matches the given text.
146 inline bool LookingAt(const char* text);
147 // True if the next token is of the given type.
148 inline bool LookingAtType(io::Tokenizer::TokenType token_type);
149
150 // If the next token exactly matches the text given, consume it and return
151 // true. Otherwise, return false without logging an error.
152 bool TryConsume(const char* text);
153
154 // These attempt to read some kind of token from the input. If successful,
155 // they return true. Otherwise they return false and add the given error
156 // to the error list.
157
158 // Consume a token with the exact text given.
159 bool Consume(const char* text, const char* error);
160 // Same as above, but automatically generates the error "Expected \"text\".",
161 // where "text" is the expected token text.
162 bool Consume(const char* text);
163 // Consume a token of type IDENTIFIER and store its text in "output".
164 bool ConsumeIdentifier(string* output, const char* error);
165 // Consume an integer and store its value in "output".
166 bool ConsumeInteger(int* output, const char* error);
167 // Consume a 64-bit integer and store its value in "output". If the value
168 // is greater than max_value, an error will be reported.
169 bool ConsumeInteger64(uint64 max_value, uint64* output, const char* error);
170 // Consume a number and store its value in "output". This will accept
171 // tokens of either INTEGER or FLOAT type.
172 bool ConsumeNumber(double* output, const char* error);
173 // Consume a string literal and store its (unescaped) value in "output".
174 bool ConsumeString(string* output, const char* error);
175
176 // -----------------------------------------------------------------
177 // Error logging helpers
178
179 // Invokes error_collector_->AddError(), if error_collector_ is not NULL.
180 void AddError(int line, int column, const string& error);
181
182 // Invokes error_collector_->AddError() with the line and column number
183 // of the current token.
184 void AddError(const string& error);
185
liujisi@google.com33165fe2010-11-02 13:14:58 +0000186 // Records a location in the SourceCodeInfo.location table (see
187 // descriptor.proto). We use RAII to ensure that the start and end locations
188 // are recorded -- the constructor records the start location and the
189 // destructor records the end location. Since the parser is
190 // recursive-descent, this works out beautifully.
191 class LIBPROTOBUF_EXPORT LocationRecorder {
192 public:
193 // Construct the file's "root" location.
194 LocationRecorder(Parser* parser);
temporal40ee5512008-07-10 02:12:20 +0000195
liujisi@google.com33165fe2010-11-02 13:14:58 +0000196 // Construct a location that represents a declaration nested within the
197 // given parent. E.g. a field's location is nested within the location
198 // for a message type. The parent's path will be copied, so you should
199 // call AddPath() only to add the path components leading from the parent
200 // to the child (as opposed to leading from the root to the child).
201 LocationRecorder(const LocationRecorder& parent);
202
203 // Convenience constructors that call AddPath() one or two times.
204 LocationRecorder(const LocationRecorder& parent, int path1);
205 LocationRecorder(const LocationRecorder& parent, int path1, int path2);
206
207 ~LocationRecorder();
208
209 // Add a path component. See SourceCodeInfo.Location.path in
210 // descriptor.proto.
211 void AddPath(int path_component);
212
213 // By default the location is considered to start at the current token at
214 // the time the LocationRecorder is created. StartAt() sets the start
215 // location to the given token instead.
216 void StartAt(const io::Tokenizer::Token& token);
217
218 // By default the location is considered to end at the previous token at
219 // the time the LocationRecorder is destroyed. EndAt() sets the end
220 // location to the given token instead.
221 void EndAt(const io::Tokenizer::Token& token);
222
223 // Records the start point of this location to the SourceLocationTable that
224 // was passed to RecordSourceLocationsTo(), if any. SourceLocationTable
225 // is an older way of keeping track of source locations which is still
226 // used in some places.
227 void RecordLegacyLocation(const Message* descriptor,
228 DescriptorPool::ErrorCollector::ErrorLocation location);
229
230 private:
231 Parser* parser_;
232 SourceCodeInfo::Location* location_;
233
234 void Init(const LocationRecorder& parent);
235 };
temporal40ee5512008-07-10 02:12:20 +0000236
237 // =================================================================
238 // Parsers for various language constructs
239
240 // Parses the "syntax = \"proto2\";" line at the top of the file. Returns
241 // false if it failed to parse or if the syntax identifier was not
242 // recognized.
243 bool ParseSyntaxIdentifier();
244
245 // These methods parse various individual bits of code. They return
246 // false if they completely fail to parse the construct. In this case,
247 // it is probably necessary to skip the rest of the statement to recover.
248 // However, if these methods return true, it does NOT mean that there
249 // were no errors; only that there were no *syntax* errors. For instance,
250 // if a service method is defined using proper syntax but uses a primitive
251 // type as its input or output, ParseMethodField() still returns true
252 // and only reports the error by calling AddError(). In practice, this
253 // makes logic much simpler for the caller.
254
255 // Parse a top-level message, enum, service, etc.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000256 bool ParseTopLevelStatement(FileDescriptorProto* file,
257 const LocationRecorder& root_location);
temporal40ee5512008-07-10 02:12:20 +0000258
259 // Parse various language high-level language construrcts.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000260 bool ParseMessageDefinition(DescriptorProto* message,
261 const LocationRecorder& message_location);
262 bool ParseEnumDefinition(EnumDescriptorProto* enum_type,
263 const LocationRecorder& enum_location);
264 bool ParseServiceDefinition(ServiceDescriptorProto* service,
265 const LocationRecorder& service_location);
266 bool ParsePackage(FileDescriptorProto* file,
267 const LocationRecorder& root_location);
268 bool ParseImport(string* import_filename,
269 const LocationRecorder& root_location,
270 int index);
271 bool ParseOption(Message* options,
272 const LocationRecorder& options_location);
temporal40ee5512008-07-10 02:12:20 +0000273
274 // These methods parse the contents of a message, enum, or service type and
275 // add them to the given object. They consume the entire block including
276 // the beginning and ending brace.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000277 bool ParseMessageBlock(DescriptorProto* message,
278 const LocationRecorder& message_location);
279 bool ParseEnumBlock(EnumDescriptorProto* enum_type,
280 const LocationRecorder& enum_location);
281 bool ParseServiceBlock(ServiceDescriptorProto* service,
282 const LocationRecorder& service_location);
temporal40ee5512008-07-10 02:12:20 +0000283
284 // Parse one statement within a message, enum, or service block, inclunding
285 // final semicolon.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000286 bool ParseMessageStatement(DescriptorProto* message,
287 const LocationRecorder& message_location);
288 bool ParseEnumStatement(EnumDescriptorProto* message,
289 const LocationRecorder& enum_location);
290 bool ParseServiceStatement(ServiceDescriptorProto* message,
291 const LocationRecorder& service_location);
temporal40ee5512008-07-10 02:12:20 +0000292
293 // Parse a field of a message. If the field is a group, its type will be
294 // added to "messages".
liujisi@google.com33165fe2010-11-02 13:14:58 +0000295 //
296 // parent_location and location_field_number_for_nested_type are needed when
297 // parsing groups -- we need to generate a nested message type within the
298 // parent and record its location accordingly. Since the parent could be
299 // either a FileDescriptorProto or a DescriptorProto, we must pass in the
300 // correct field number to use.
temporal40ee5512008-07-10 02:12:20 +0000301 bool ParseMessageField(FieldDescriptorProto* field,
liujisi@google.com33165fe2010-11-02 13:14:58 +0000302 RepeatedPtrField<DescriptorProto>* messages,
303 const LocationRecorder& parent_location,
304 int location_field_number_for_nested_type,
305 const LocationRecorder& field_location);
temporal40ee5512008-07-10 02:12:20 +0000306
307 // Parse an "extensions" declaration.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000308 bool ParseExtensions(DescriptorProto* message,
309 const LocationRecorder& extensions_location);
temporal40ee5512008-07-10 02:12:20 +0000310
liujisi@google.com33165fe2010-11-02 13:14:58 +0000311 // Parse an "extend" declaration. (See also comments for
312 // ParseMessageField().)
temporal40ee5512008-07-10 02:12:20 +0000313 bool ParseExtend(RepeatedPtrField<FieldDescriptorProto>* extensions,
liujisi@google.com33165fe2010-11-02 13:14:58 +0000314 RepeatedPtrField<DescriptorProto>* messages,
315 const LocationRecorder& parent_location,
316 int location_field_number_for_nested_type,
317 const LocationRecorder& extend_location);
temporal40ee5512008-07-10 02:12:20 +0000318
319 // Parse a single enum value within an enum block.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000320 bool ParseEnumConstant(EnumValueDescriptorProto* enum_value,
321 const LocationRecorder& enum_value_location);
temporal40ee5512008-07-10 02:12:20 +0000322
kenton@google.com26bd9ee2008-11-21 00:06:27 +0000323 // Parse enum constant options, i.e. the list in square brackets at the end
324 // of the enum constant value definition.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000325 bool ParseEnumConstantOptions(EnumValueDescriptorProto* value,
326 const LocationRecorder& enum_value_location);
kenton@google.com26bd9ee2008-11-21 00:06:27 +0000327
temporal40ee5512008-07-10 02:12:20 +0000328 // Parse a single method within a service definition.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000329 bool ParseServiceMethod(MethodDescriptorProto* method,
330 const LocationRecorder& method_location);
temporal40ee5512008-07-10 02:12:20 +0000331
332 // Parse "required", "optional", or "repeated" and fill in "label"
333 // with the value.
334 bool ParseLabel(FieldDescriptorProto::Label* label);
335
336 // Parse a type name and fill in "type" (if it is a primitive) or
337 // "type_name" (if it is not) with the type parsed.
338 bool ParseType(FieldDescriptorProto::Type* type,
339 string* type_name);
340 // Parse a user-defined type and fill in "type_name" with the name.
341 // If a primitive type is named, it is treated as an error.
342 bool ParseUserDefinedType(string* type_name);
343
344 // Parses field options, i.e. the stuff in square brackets at the end
345 // of a field definition. Also parses default value.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000346 bool ParseFieldOptions(FieldDescriptorProto* field,
347 const LocationRecorder& field_location);
temporal40ee5512008-07-10 02:12:20 +0000348
349 // Parse the "default" option. This needs special handling because its
350 // type is the field's type.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000351 bool ParseDefaultAssignment(FieldDescriptorProto* field,
352 const LocationRecorder& field_location);
temporal40ee5512008-07-10 02:12:20 +0000353
354 // Parse a single option name/value pair, e.g. "ctype = CORD". The name
355 // identifies a field of the given Message, and the value of that field
356 // is set to the parsed value.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000357 bool ParseOptionAssignment(Message* options,
358 const LocationRecorder& options_location);
temporal40ee5512008-07-10 02:12:20 +0000359
kenton@google.com24bf56f2008-09-24 20:31:01 +0000360 // Parses a single part of a multipart option name. A multipart name consists
361 // of names separated by dots. Each name is either an identifier or a series
362 // of identifiers separated by dots and enclosed in parentheses. E.g.,
363 // "foo.(bar.baz).qux".
liujisi@google.com33165fe2010-11-02 13:14:58 +0000364 bool ParseOptionNamePart(UninterpretedOption* uninterpreted_option,
365 const LocationRecorder& part_location);
366
367 // Parses a string surrounded by balanced braces. Strips off the outer
368 // braces and stores the enclosed string in *value.
369 // E.g.,
370 // { foo } *value gets 'foo'
371 // { foo { bar: box } } *value gets 'foo { bar: box }'
372 // {} *value gets ''
373 //
374 // REQUIRES: LookingAt("{")
375 // When finished successfully, we are looking at the first token past
376 // the ending brace.
377 bool ParseUninterpretedBlock(string* value);
kenton@google.com24bf56f2008-09-24 20:31:01 +0000378
temporal40ee5512008-07-10 02:12:20 +0000379 // =================================================================
380
381 io::Tokenizer* input_;
382 io::ErrorCollector* error_collector_;
liujisi@google.com33165fe2010-11-02 13:14:58 +0000383 SourceCodeInfo* source_code_info_;
384 SourceLocationTable* source_location_table_; // legacy
temporal40ee5512008-07-10 02:12:20 +0000385 bool had_errors_;
386 bool require_syntax_identifier_;
kenton@google.comd37d46d2009-04-25 02:53:47 +0000387 bool stop_after_syntax_identifier_;
temporal40ee5512008-07-10 02:12:20 +0000388 string syntax_identifier_;
389
390 GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Parser);
391};
392
393// A table mapping (descriptor, ErrorLocation) pairs -- as reported by
394// DescriptorPool when validating descriptors -- to line and column numbers
395// within the original source code.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000396//
397// This is semi-obsolete: FileDescriptorProto.source_code_info now contains
398// far more complete information about source locations. However, as of this
399// writing you still need to use SourceLocationTable when integrating with
400// DescriptorPool.
temporal40ee5512008-07-10 02:12:20 +0000401class LIBPROTOBUF_EXPORT SourceLocationTable {
402 public:
403 SourceLocationTable();
404 ~SourceLocationTable();
405
406 // Finds the precise location of the given error and fills in *line and
407 // *column with the line and column numbers. If not found, sets *line to
408 // -1 and *column to 0 (since line = -1 is used to mean "error has no exact
409 // location" in the ErrorCollector interface). Returns true if found, false
410 // otherwise.
411 bool Find(const Message* descriptor,
412 DescriptorPool::ErrorCollector::ErrorLocation location,
413 int* line, int* column) const;
414
415 // Adds a location to the table.
416 void Add(const Message* descriptor,
417 DescriptorPool::ErrorCollector::ErrorLocation location,
418 int line, int column);
419
420 // Clears the contents of the table.
421 void Clear();
422
423 private:
424 typedef map<
425 pair<const Message*, DescriptorPool::ErrorCollector::ErrorLocation>,
426 pair<int, int> > LocationMap;
427 LocationMap location_map_;
428};
429
}  // namespace compiler
}  // namespace protobuf

}  // namespace google
#endif  // GOOGLE_PROTOBUF_COMPILER_PARSER_H__