blob: c0513a8b241339b67dfa7c6d0cf7712a65bbc813 [file] [log] [blame]
Joe Tsai27c2a762018-08-01 16:48:18 -07001// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package text
6
7import (
8 "bytes"
9 "io"
10 "regexp"
Herbie Onga3369c52019-04-23 00:24:46 -070011 "strconv"
Joe Tsai27c2a762018-08-01 16:48:18 -070012 "unicode/utf8"
13
Damien Neile89e6242019-05-13 23:55:40 -070014 "google.golang.org/protobuf/internal/errors"
15 "google.golang.org/protobuf/reflect/protoreflect"
Joe Tsai27c2a762018-08-01 16:48:18 -070016)
17
// syntaxError marks an error as a text-format syntax error by embedding it.
// Unmarshal type-asserts for this wrapper to decide whether an error should
// be annotated with a line and column.
type syntaxError struct {
	error
}
19
20func newSyntaxError(f string, x ...interface{}) error {
21 return syntaxError{errors.New(f, x...)}
22}
23
24// Unmarshal parses b as the proto text format.
25// It returns a Value, which is always of the Message type.
26func Unmarshal(b []byte) (Value, error) {
27 p := decoder{in: b}
28 p.consume(0) // trim leading spaces or comments
29 v, err := p.unmarshalMessage(false)
Damien Neil8c86fc52019-06-19 09:28:29 -070030 if err != nil {
Joe Tsai27c2a762018-08-01 16:48:18 -070031 if e, ok := err.(syntaxError); ok {
32 b = b[:len(b)-len(p.in)] // consumed input
33 line := bytes.Count(b, []byte("\n")) + 1
34 if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
35 b = b[i+1:]
36 }
37 column := utf8.RuneCount(b) + 1 // ignore multi-rune characters
38 err = errors.New("syntax error (line %d:%d): %v", line, column, e.error)
39 }
40 return Value{}, err
41 }
42 if len(p.in) > 0 {
43 return Value{}, errors.New("%d bytes of unconsumed input", len(p.in))
44 }
Damien Neil8c86fc52019-06-19 09:28:29 -070045 return v, nil
Joe Tsai27c2a762018-08-01 16:48:18 -070046}
47
// decoder holds the state of a text-format parse.
type decoder struct {
	// in is the input that has not been consumed yet; the parsing methods
	// advance it by re-slicing as they accept bytes.
	in []byte
}
51
52func (p *decoder) unmarshalList() (Value, error) {
53 b := p.in
54 var elems []Value
55 if err := p.consumeChar('[', "at start of list"); err != nil {
56 return Value{}, err
57 }
58 if len(p.in) > 0 && p.in[0] != ']' {
59 for len(p.in) > 0 {
60 v, err := p.unmarshalValue()
Damien Neil8c86fc52019-06-19 09:28:29 -070061 if err != nil {
Joe Tsai27c2a762018-08-01 16:48:18 -070062 return Value{}, err
63 }
64 elems = append(elems, v)
65 if !p.tryConsumeChar(',') {
66 break
67 }
68 }
69 }
70 if err := p.consumeChar(']', "at end of list"); err != nil {
71 return Value{}, err
72 }
73 b = b[:len(b)-len(p.in)]
74 return rawValueOf(elems, b[:len(b):len(b)]), nil
75}
76
77func (p *decoder) unmarshalMessage(checkDelims bool) (Value, error) {
78 b := p.in
79 var items [][2]Value
80 delims := [2]byte{'{', '}'}
81 if len(p.in) > 0 && p.in[0] == '<' {
82 delims = [2]byte{'<', '>'}
83 }
84 if checkDelims {
85 if err := p.consumeChar(delims[0], "at start of message"); err != nil {
86 return Value{}, err
87 }
88 }
89 for len(p.in) > 0 {
90 if p.in[0] == '}' || p.in[0] == '>' {
91 break
92 }
93 k, err := p.unmarshalKey()
Damien Neil8c86fc52019-06-19 09:28:29 -070094 if err != nil {
Joe Tsai27c2a762018-08-01 16:48:18 -070095 return Value{}, err
96 }
97 if !p.tryConsumeChar(':') && len(p.in) > 0 && p.in[0] != '{' && p.in[0] != '<' {
98 return Value{}, newSyntaxError("expected ':' after message key")
99 }
100 v, err := p.unmarshalValue()
Damien Neil8c86fc52019-06-19 09:28:29 -0700101 if err != nil {
Joe Tsai27c2a762018-08-01 16:48:18 -0700102 return Value{}, err
103 }
104 if p.tryConsumeChar(';') || p.tryConsumeChar(',') {
105 // always optional
106 }
107 items = append(items, [2]Value{k, v})
108 }
109 if checkDelims {
110 if err := p.consumeChar(delims[1], "at end of message"); err != nil {
111 return Value{}, err
112 }
113 }
114 b = b[:len(b)-len(p.in)]
115 return rawValueOf(items, b[:len(b):len(b)]), nil
116}
117
Joe Tsai27c2a762018-08-01 16:48:18 -0700118// unmarshalKey parses the key, which may be a Name, String, or Uint.
119func (p *decoder) unmarshalKey() (v Value, err error) {
120 if p.tryConsumeChar('[') {
121 if len(p.in) == 0 {
122 return Value{}, io.ErrUnexpectedEOF
123 }
124 if p.in[0] == '\'' || p.in[0] == '"' {
125 // Historically, Go's parser allowed a string for the Any type URL.
126 // This is specific to Go and contrary to the C++ implementation,
127 // which does not support strings for the Any type URL.
128 v, err = p.unmarshalString()
Damien Neil8c86fc52019-06-19 09:28:29 -0700129 if err != nil {
Joe Tsai27c2a762018-08-01 16:48:18 -0700130 return Value{}, err
131 }
Joe Tsai27c2a762018-08-01 16:48:18 -0700132 } else {
Herbie Onga3369c52019-04-23 00:24:46 -0700133 v, err = p.unmarshalURL()
134 if err != nil {
135 return Value{}, err
136 }
Joe Tsai27c2a762018-08-01 16:48:18 -0700137 }
138 if err := p.consumeChar(']', "at end of extension name"); err != nil {
139 return Value{}, err
140 }
141 return v, nil
142 }
Herbie Onga3369c52019-04-23 00:24:46 -0700143 v, err = p.unmarshalName()
144 if err == nil {
145 return v, nil
Joe Tsai27c2a762018-08-01 16:48:18 -0700146 }
Herbie Onga3369c52019-04-23 00:24:46 -0700147 v, err = p.unmarshalNumberKey()
148 if err == nil {
149 return v, nil
150 }
151 return Value{}, err
152}
153
154// unmarshalURL parses an Any type URL string. The C++ parser does not handle
155// many legal URL strings. This implementation is more liberal and allows for
156// the pattern ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`).
157func (p *decoder) unmarshalURL() (Value, error) {
158 s := p.in
159 var size int
160 for len(s) > 0 && (s[0] == '-' || s[0] == '_' ||
161 ('0' <= s[0] && s[0] <= '9') ||
162 ('a' <= s[0] && s[0] <= 'z') ||
163 ('A' <= s[0] && s[0] <= 'Z')) {
164 s = s[1:]
165 size++
166 if len(s) > 0 && (s[0] == '/' || s[0] == '.') {
167 s = s[1:]
168 size++
169 }
170 }
171
172 // Last character cannot be '.' or '/'.
173 // Next byte should either be a delimiter or it is at the end.
174 if size == 0 || p.in[size-1] == '.' || p.in[size-1] == '/' ||
175 (len(s) > 0 && !isDelim(s[0])) {
176 return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
177 }
178 v := rawValueOf(string(p.in[:size]), p.in[:size:size])
179 p.consume(size)
180 return v, nil
181}
182
183// unmarshalNumberKey parses field number as key. Field numbers are non-negative
184// integers.
185func (p *decoder) unmarshalNumberKey() (Value, error) {
186 num, ok := parseNumber(p.in)
187 if !ok || num.neg || num.typ == numFloat {
188 return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
189 }
190 v, err := strconv.ParseUint(string(num.value), 0, 64)
191 if err != nil {
192 return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
193 }
194 p.consume(num.size)
195 return rawValueOf(v, num.value), nil
Joe Tsai27c2a762018-08-01 16:48:18 -0700196}
197
198func (p *decoder) unmarshalValue() (Value, error) {
199 if len(p.in) == 0 {
200 return Value{}, io.ErrUnexpectedEOF
201 }
202 switch p.in[0] {
203 case '"', '\'':
204 return p.unmarshalStrings()
205 case '[':
206 return p.unmarshalList()
207 case '{', '<':
208 return p.unmarshalMessage(true)
209 default:
Herbie Onga3369c52019-04-23 00:24:46 -0700210 n, ok := consumeName(p.in)
211 if ok && literals[string(p.in[:n])] == nil {
212 v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
213 p.consume(n)
214 return v, nil
Joe Tsai27c2a762018-08-01 16:48:18 -0700215 }
216 return p.unmarshalNumber()
217 }
218}
219
Herbie Onga3369c52019-04-23 00:24:46 -0700220// unmarshalName unmarshals an unquoted proto identifier.
221// Regular expression that matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
Joe Tsai27c2a762018-08-01 16:48:18 -0700222//
223// E.g., `field_name` => ValueOf(protoreflect.Name("field_name"))
224func (p *decoder) unmarshalName() (Value, error) {
Herbie Onga3369c52019-04-23 00:24:46 -0700225 n, ok := consumeName(p.in)
226 if !ok {
227 return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
Joe Tsai27c2a762018-08-01 16:48:18 -0700228 }
Herbie Onga3369c52019-04-23 00:24:46 -0700229
230 v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
231 p.consume(n)
232 return v, nil
233}
234
235func consumeName(input []byte) (int, bool) {
236 var n int
237
238 s := input
239 if len(s) == 0 {
240 return 0, false
241 }
242
243 switch {
244 case s[0] == '_',
245 'a' <= s[0] && s[0] <= 'z',
246 'A' <= s[0] && s[0] <= 'Z':
247 s = s[1:]
248 n++
249 default:
250 return 0, false
251 }
252
253 for len(s) > 0 && (s[0] == '_' ||
254 'a' <= s[0] && s[0] <= 'z' ||
255 'A' <= s[0] && s[0] <= 'Z' ||
256 '0' <= s[0] && s[0] <= '9') {
257 s = s[1:]
258 n++
259 }
260
261 if len(s) > 0 && !isDelim(s[0]) {
262 return 0, false
263 }
264
265 return n, true
Joe Tsai27c2a762018-08-01 16:48:18 -0700266}
267
268func (p *decoder) consumeChar(c byte, msg string) error {
269 if p.tryConsumeChar(c) {
270 return nil
271 }
272 if len(p.in) == 0 {
273 return io.ErrUnexpectedEOF
274 }
275 return newSyntaxError("invalid character %q, expected %q %s", p.in[0], c, msg)
276}
277
278func (p *decoder) tryConsumeChar(c byte) bool {
279 if len(p.in) > 0 && p.in[0] == c {
280 p.consume(1)
281 return true
282 }
283 return false
284}
285
286// consume consumes n bytes of input and any subsequent whitespace or comments.
287func (p *decoder) consume(n int) {
288 p.in = p.in[n:]
289 for len(p.in) > 0 {
290 switch p.in[0] {
291 case ' ', '\n', '\r', '\t':
292 p.in = p.in[1:]
293 case '#':
294 if i := bytes.IndexByte(p.in, '\n'); i >= 0 {
295 p.in = p.in[i+len("\n"):]
296 } else {
297 p.in = nil
298 }
299 default:
300 return
301 }
302 }
303}
304
var (
	// errRegexp picks the leading token of the remaining input for use in
	// error messages: either a run of bytes that look like a non-delimiter
	// (letters, digits, '-', '+', '.', '_', '/') or, failing that, a single
	// character.
	errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
)
Joe Tsai27c2a762018-08-01 16:48:18 -0700307
// isDelim returns true if given byte is a delimiter character, that is, any
// byte other than an ASCII letter, an ASCII digit, '-', '+', '.', or '_'.
func isDelim(c byte) bool {
	switch {
	case c == '-', c == '+', c == '.', c == '_':
		return false
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
		return false
	}
	return true
}