blob: 2b32ed9ee35e879c7b2a4e4cb84311eee8463a85 [file] [log] [blame]
Joe Tsai27c2a762018-08-01 16:48:18 -07001// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package text
6
7import (
8 "bytes"
9 "io"
10 "regexp"
11 "unicode/utf8"
12
Damien Neile89e6242019-05-13 23:55:40 -070013 "google.golang.org/protobuf/internal/errors"
14 "google.golang.org/protobuf/reflect/protoreflect"
Joe Tsai27c2a762018-08-01 16:48:18 -070015)
16
17type syntaxError struct{ error }
18
19func newSyntaxError(f string, x ...interface{}) error {
20 return syntaxError{errors.New(f, x...)}
21}
22
23// Unmarshal parses b as the proto text format.
24// It returns a Value, which is always of the Message type.
25func Unmarshal(b []byte) (Value, error) {
26 p := decoder{in: b}
27 p.consume(0) // trim leading spaces or comments
28 v, err := p.unmarshalMessage(false)
Damien Neil8c86fc52019-06-19 09:28:29 -070029 if err != nil {
Joe Tsai27c2a762018-08-01 16:48:18 -070030 if e, ok := err.(syntaxError); ok {
31 b = b[:len(b)-len(p.in)] // consumed input
32 line := bytes.Count(b, []byte("\n")) + 1
33 if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
34 b = b[i+1:]
35 }
36 column := utf8.RuneCount(b) + 1 // ignore multi-rune characters
37 err = errors.New("syntax error (line %d:%d): %v", line, column, e.error)
38 }
39 return Value{}, err
40 }
41 if len(p.in) > 0 {
42 return Value{}, errors.New("%d bytes of unconsumed input", len(p.in))
43 }
Damien Neil8c86fc52019-06-19 09:28:29 -070044 return v, nil
Joe Tsai27c2a762018-08-01 16:48:18 -070045}
46
47type decoder struct {
Damien Neil8c86fc52019-06-19 09:28:29 -070048 in []byte
Joe Tsai27c2a762018-08-01 16:48:18 -070049}
50
51func (p *decoder) unmarshalList() (Value, error) {
52 b := p.in
53 var elems []Value
54 if err := p.consumeChar('[', "at start of list"); err != nil {
55 return Value{}, err
56 }
57 if len(p.in) > 0 && p.in[0] != ']' {
58 for len(p.in) > 0 {
59 v, err := p.unmarshalValue()
Damien Neil8c86fc52019-06-19 09:28:29 -070060 if err != nil {
Joe Tsai27c2a762018-08-01 16:48:18 -070061 return Value{}, err
62 }
63 elems = append(elems, v)
64 if !p.tryConsumeChar(',') {
65 break
66 }
67 }
68 }
69 if err := p.consumeChar(']', "at end of list"); err != nil {
70 return Value{}, err
71 }
72 b = b[:len(b)-len(p.in)]
73 return rawValueOf(elems, b[:len(b):len(b)]), nil
74}
75
76func (p *decoder) unmarshalMessage(checkDelims bool) (Value, error) {
77 b := p.in
78 var items [][2]Value
79 delims := [2]byte{'{', '}'}
80 if len(p.in) > 0 && p.in[0] == '<' {
81 delims = [2]byte{'<', '>'}
82 }
83 if checkDelims {
84 if err := p.consumeChar(delims[0], "at start of message"); err != nil {
85 return Value{}, err
86 }
87 }
88 for len(p.in) > 0 {
89 if p.in[0] == '}' || p.in[0] == '>' {
90 break
91 }
92 k, err := p.unmarshalKey()
Damien Neil8c86fc52019-06-19 09:28:29 -070093 if err != nil {
Joe Tsai27c2a762018-08-01 16:48:18 -070094 return Value{}, err
95 }
96 if !p.tryConsumeChar(':') && len(p.in) > 0 && p.in[0] != '{' && p.in[0] != '<' {
97 return Value{}, newSyntaxError("expected ':' after message key")
98 }
99 v, err := p.unmarshalValue()
Damien Neil8c86fc52019-06-19 09:28:29 -0700100 if err != nil {
Joe Tsai27c2a762018-08-01 16:48:18 -0700101 return Value{}, err
102 }
103 if p.tryConsumeChar(';') || p.tryConsumeChar(',') {
104 // always optional
105 }
106 items = append(items, [2]Value{k, v})
107 }
108 if checkDelims {
109 if err := p.consumeChar(delims[1], "at end of message"); err != nil {
110 return Value{}, err
111 }
112 }
113 b = b[:len(b)-len(p.in)]
114 return rawValueOf(items, b[:len(b):len(b)]), nil
115}
116
117// This expression is more liberal than ConsumeAnyTypeUrl in C++.
118// However, the C++ parser does not handle many legal URL strings.
119// The Go implementation is more liberal to be backwards compatible with
120// the historical Go implementation which was overly liberal (and buggy).
121var urlRegexp = regexp.MustCompile(`^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`)
122
123// unmarshalKey parses the key, which may be a Name, String, or Uint.
124func (p *decoder) unmarshalKey() (v Value, err error) {
125 if p.tryConsumeChar('[') {
126 if len(p.in) == 0 {
127 return Value{}, io.ErrUnexpectedEOF
128 }
129 if p.in[0] == '\'' || p.in[0] == '"' {
130 // Historically, Go's parser allowed a string for the Any type URL.
131 // This is specific to Go and contrary to the C++ implementation,
132 // which does not support strings for the Any type URL.
133 v, err = p.unmarshalString()
Damien Neil8c86fc52019-06-19 09:28:29 -0700134 if err != nil {
Joe Tsai27c2a762018-08-01 16:48:18 -0700135 return Value{}, err
136 }
137 } else if n := matchWithDelim(urlRegexp, p.in); n > 0 {
138 v = rawValueOf(string(p.in[:n]), p.in[:n:n])
139 p.consume(n)
140 } else {
141 return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
142 }
143 if err := p.consumeChar(']', "at end of extension name"); err != nil {
144 return Value{}, err
145 }
146 return v, nil
147 }
148 if matchWithDelim(intRegexp, p.in) > 0 && p.in[0] != '-' {
149 return p.unmarshalNumber()
150 }
151 return p.unmarshalName()
152}
153
154func (p *decoder) unmarshalValue() (Value, error) {
155 if len(p.in) == 0 {
156 return Value{}, io.ErrUnexpectedEOF
157 }
158 switch p.in[0] {
159 case '"', '\'':
160 return p.unmarshalStrings()
161 case '[':
162 return p.unmarshalList()
163 case '{', '<':
164 return p.unmarshalMessage(true)
165 default:
166 n := matchWithDelim(nameRegexp, p.in) // zero if no match
167 if n > 0 && literals[string(p.in[:n])] == nil {
168 return p.unmarshalName()
169 }
170 return p.unmarshalNumber()
171 }
172}
173
174// This expression matches all valid proto identifiers.
175var nameRegexp = regexp.MustCompile(`^[_a-zA-Z][_a-zA-Z0-9]*`)
176
177// unmarshalName unmarshals an unquoted identifier.
178//
179// E.g., `field_name` => ValueOf(protoreflect.Name("field_name"))
180func (p *decoder) unmarshalName() (Value, error) {
181 if n := matchWithDelim(nameRegexp, p.in); n > 0 {
182 v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
183 p.consume(n)
184 return v, nil
185 }
186 return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
187}
188
189func (p *decoder) consumeChar(c byte, msg string) error {
190 if p.tryConsumeChar(c) {
191 return nil
192 }
193 if len(p.in) == 0 {
194 return io.ErrUnexpectedEOF
195 }
196 return newSyntaxError("invalid character %q, expected %q %s", p.in[0], c, msg)
197}
198
199func (p *decoder) tryConsumeChar(c byte) bool {
200 if len(p.in) > 0 && p.in[0] == c {
201 p.consume(1)
202 return true
203 }
204 return false
205}
206
207// consume consumes n bytes of input and any subsequent whitespace or comments.
208func (p *decoder) consume(n int) {
209 p.in = p.in[n:]
210 for len(p.in) > 0 {
211 switch p.in[0] {
212 case ' ', '\n', '\r', '\t':
213 p.in = p.in[1:]
214 case '#':
215 if i := bytes.IndexByte(p.in, '\n'); i >= 0 {
216 p.in = p.in[i+len("\n"):]
217 } else {
218 p.in = nil
219 }
220 default:
221 return
222 }
223 }
224}
225
// errRegexp matches any sequence that looks like a non-delimiter (at most
// 32 bytes of it), or else a single character. It is used to quote the
// offending input in error messages.
var errRegexp = regexp.MustCompile("^([-+._a-zA-Z0-9]{1,32}|.)")

// matchWithDelim matches r against the input b and returns the length of
// the match, provided that the match is terminated by a delimiter of some
// form (e.g., r"[^-+_.a-zA-Z0-9]"). As a special case, EOF is considered a
// delimiter. It returns 0 if r does not match or if the match is directly
// followed by a non-delimiter.
func matchWithDelim(r *regexp.Regexp, b []byte) int {
	n := len(r.Find(b))
	if n >= len(b) {
		return n // the match extends to EOF
	}
	// The byte right after the match must not continue a token.
	switch c := b[n]; {
	case c == '-', c == '+', c == '.', c == '_',
		'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z',
		'0' <= c && c <= '9':
		return 0
	}
	return n
}