blob: 0babddf6729fea0e2f3fae283d4dabb4251d2992 [file] [log] [blame]
Joe Tsai27c2a762018-08-01 16:48:18 -07001// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package text
6
7import (
8 "bytes"
9 "io"
10 "regexp"
11 "unicode/utf8"
12
Damien Neile89e6242019-05-13 23:55:40 -070013 "google.golang.org/protobuf/internal/errors"
14 "google.golang.org/protobuf/reflect/protoreflect"
Joe Tsai27c2a762018-08-01 16:48:18 -070015)
16
// syntaxError wraps an error so that it can be recognized by type.
// Unmarshal type-asserts on syntaxError to decide whether the error
// should be decorated with a line and column position.
type syntaxError struct {
	error
}
18
19func newSyntaxError(f string, x ...interface{}) error {
20 return syntaxError{errors.New(f, x...)}
21}
22
23// Unmarshal parses b as the proto text format.
24// It returns a Value, which is always of the Message type.
25func Unmarshal(b []byte) (Value, error) {
26 p := decoder{in: b}
27 p.consume(0) // trim leading spaces or comments
28 v, err := p.unmarshalMessage(false)
29 if !p.nerr.Merge(err) {
30 if e, ok := err.(syntaxError); ok {
31 b = b[:len(b)-len(p.in)] // consumed input
32 line := bytes.Count(b, []byte("\n")) + 1
33 if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
34 b = b[i+1:]
35 }
36 column := utf8.RuneCount(b) + 1 // ignore multi-rune characters
37 err = errors.New("syntax error (line %d:%d): %v", line, column, e.error)
38 }
39 return Value{}, err
40 }
41 if len(p.in) > 0 {
42 return Value{}, errors.New("%d bytes of unconsumed input", len(p.in))
43 }
44 return v, p.nerr.E
45}
46
47type decoder struct {
48 nerr errors.NonFatal
49 in []byte
50}
51
52func (p *decoder) unmarshalList() (Value, error) {
53 b := p.in
54 var elems []Value
55 if err := p.consumeChar('[', "at start of list"); err != nil {
56 return Value{}, err
57 }
58 if len(p.in) > 0 && p.in[0] != ']' {
59 for len(p.in) > 0 {
60 v, err := p.unmarshalValue()
61 if !p.nerr.Merge(err) {
62 return Value{}, err
63 }
64 elems = append(elems, v)
65 if !p.tryConsumeChar(',') {
66 break
67 }
68 }
69 }
70 if err := p.consumeChar(']', "at end of list"); err != nil {
71 return Value{}, err
72 }
73 b = b[:len(b)-len(p.in)]
74 return rawValueOf(elems, b[:len(b):len(b)]), nil
75}
76
77func (p *decoder) unmarshalMessage(checkDelims bool) (Value, error) {
78 b := p.in
79 var items [][2]Value
80 delims := [2]byte{'{', '}'}
81 if len(p.in) > 0 && p.in[0] == '<' {
82 delims = [2]byte{'<', '>'}
83 }
84 if checkDelims {
85 if err := p.consumeChar(delims[0], "at start of message"); err != nil {
86 return Value{}, err
87 }
88 }
89 for len(p.in) > 0 {
90 if p.in[0] == '}' || p.in[0] == '>' {
91 break
92 }
93 k, err := p.unmarshalKey()
94 if !p.nerr.Merge(err) {
95 return Value{}, err
96 }
97 if !p.tryConsumeChar(':') && len(p.in) > 0 && p.in[0] != '{' && p.in[0] != '<' {
98 return Value{}, newSyntaxError("expected ':' after message key")
99 }
100 v, err := p.unmarshalValue()
101 if !p.nerr.Merge(err) {
102 return Value{}, err
103 }
104 if p.tryConsumeChar(';') || p.tryConsumeChar(',') {
105 // always optional
106 }
107 items = append(items, [2]Value{k, v})
108 }
109 if checkDelims {
110 if err := p.consumeChar(delims[1], "at end of message"); err != nil {
111 return Value{}, err
112 }
113 }
114 b = b[:len(b)-len(p.in)]
115 return rawValueOf(items, b[:len(b):len(b)]), nil
116}
117
// urlRegexp matches the type URL that may appear between '[' and ']' in a key.
//
// It accepts more than ConsumeAnyTypeUrl in C++ does; the C++ parser,
// however, rejects many legal URL strings. The Go pattern stays liberal for
// backwards compatibility with the historical Go implementation, which was
// overly liberal (and buggy).
var urlRegexp = regexp.MustCompile(`^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`)
123
124// unmarshalKey parses the key, which may be a Name, String, or Uint.
125func (p *decoder) unmarshalKey() (v Value, err error) {
126 if p.tryConsumeChar('[') {
127 if len(p.in) == 0 {
128 return Value{}, io.ErrUnexpectedEOF
129 }
130 if p.in[0] == '\'' || p.in[0] == '"' {
131 // Historically, Go's parser allowed a string for the Any type URL.
132 // This is specific to Go and contrary to the C++ implementation,
133 // which does not support strings for the Any type URL.
134 v, err = p.unmarshalString()
135 if !p.nerr.Merge(err) {
136 return Value{}, err
137 }
138 } else if n := matchWithDelim(urlRegexp, p.in); n > 0 {
139 v = rawValueOf(string(p.in[:n]), p.in[:n:n])
140 p.consume(n)
141 } else {
142 return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
143 }
144 if err := p.consumeChar(']', "at end of extension name"); err != nil {
145 return Value{}, err
146 }
147 return v, nil
148 }
149 if matchWithDelim(intRegexp, p.in) > 0 && p.in[0] != '-' {
150 return p.unmarshalNumber()
151 }
152 return p.unmarshalName()
153}
154
155func (p *decoder) unmarshalValue() (Value, error) {
156 if len(p.in) == 0 {
157 return Value{}, io.ErrUnexpectedEOF
158 }
159 switch p.in[0] {
160 case '"', '\'':
161 return p.unmarshalStrings()
162 case '[':
163 return p.unmarshalList()
164 case '{', '<':
165 return p.unmarshalMessage(true)
166 default:
167 n := matchWithDelim(nameRegexp, p.in) // zero if no match
168 if n > 0 && literals[string(p.in[:n])] == nil {
169 return p.unmarshalName()
170 }
171 return p.unmarshalNumber()
172 }
173}
174
// nameRegexp matches every valid proto identifier at the start of the input:
// a letter or underscore followed by any number of letters, digits, or
// underscores.
var nameRegexp = regexp.MustCompile(`^[_a-zA-Z][_a-zA-Z0-9]*`)
177
178// unmarshalName unmarshals an unquoted identifier.
179//
180// E.g., `field_name` => ValueOf(protoreflect.Name("field_name"))
181func (p *decoder) unmarshalName() (Value, error) {
182 if n := matchWithDelim(nameRegexp, p.in); n > 0 {
183 v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
184 p.consume(n)
185 return v, nil
186 }
187 return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
188}
189
190func (p *decoder) consumeChar(c byte, msg string) error {
191 if p.tryConsumeChar(c) {
192 return nil
193 }
194 if len(p.in) == 0 {
195 return io.ErrUnexpectedEOF
196 }
197 return newSyntaxError("invalid character %q, expected %q %s", p.in[0], c, msg)
198}
199
200func (p *decoder) tryConsumeChar(c byte) bool {
201 if len(p.in) > 0 && p.in[0] == c {
202 p.consume(1)
203 return true
204 }
205 return false
206}
207
208// consume consumes n bytes of input and any subsequent whitespace or comments.
209func (p *decoder) consume(n int) {
210 p.in = p.in[n:]
211 for len(p.in) > 0 {
212 switch p.in[0] {
213 case ' ', '\n', '\r', '\t':
214 p.in = p.in[1:]
215 case '#':
216 if i := bytes.IndexByte(p.in, '\n'); i >= 0 {
217 p.in = p.in[i+len("\n"):]
218 } else {
219 p.in = nil
220 }
221 default:
222 return
223 }
224 }
225}
226
// errRegexp extracts the offending token quoted in error messages: a run of
// up to 32 non-delimiter characters, or failing that, a single character.
var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9]{1,32}|.)`)
229
// matchWithDelim matches r against the input b and returns the length of the
// match, provided the match is followed by a delimiter of some form
// (e.g., r"[^-+_.a-zA-Z0-9]"). As a special case, EOF is considered a
// delimiter. It returns 0 if r does not match or if the byte right after
// the match is not a delimiter.
func matchWithDelim(r *regexp.Regexp, b []byte) int {
	n := len(r.Find(b))
	if n >= len(b) {
		// The match runs to the end of the input, which counts as a delimiter.
		return n
	}
	// A following byte that could continue a token means there is no delimiter.
	switch c := b[n]; {
	case c == '-', c == '+', c == '.', c == '_',
		'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z',
		'0' <= c && c <= '9':
		return 0
	}
	return n
}