Blame - internal/encoding/text/decode.go - platform/external/golang-protobuf

blob: 0babddf6729fea0e2f3fae283d4dabb4251d2992 [file] [log] [blame]

Joe Tsai	27c2a76	2018-08-01 16:48:18 -0700	[diff] [blame]	1	// Copyright 2018 The Go Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	package text
				6
				7	import (
				8	"bytes"
				9	"io"
				10	"regexp"
				11	"unicode/utf8"
				12
Damien Neil	e89e624	2019-05-13 23:55:40 -0700	[diff] [blame^]	13	"google.golang.org/protobuf/internal/errors"
				14	"google.golang.org/protobuf/reflect/protoreflect"
Joe Tsai	27c2a76	2018-08-01 16:48:18 -0700	[diff] [blame]	15	)
				16
				17	type syntaxError struct{ error }
				18
				19	func newSyntaxError(f string, x ...interface{}) error {
				20	return syntaxError{errors.New(f, x...)}
				21	}
				22
				23	// Unmarshal parses b as the proto text format.
				24	// It returns a Value, which is always of the Message type.
				25	func Unmarshal(b []byte) (Value, error) {
				26	p := decoder{in: b}
				27	p.consume(0) // trim leading spaces or comments
				28	v, err := p.unmarshalMessage(false)
				29	if !p.nerr.Merge(err) {
				30	if e, ok := err.(syntaxError); ok {
				31	b = b[:len(b)-len(p.in)] // consumed input
				32	line := bytes.Count(b, []byte("\n")) + 1
				33	if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
				34	b = b[i+1:]
				35	}
				36	column := utf8.RuneCount(b) + 1 // ignore multi-rune characters
				37	err = errors.New("syntax error (line %d:%d): %v", line, column, e.error)
				38	}
				39	return Value{}, err
				40	}
				41	if len(p.in) > 0 {
				42	return Value{}, errors.New("%d bytes of unconsumed input", len(p.in))
				43	}
				44	return v, p.nerr.E
				45	}
				46
				47	type decoder struct {
				48	nerr errors.NonFatal
				49	in []byte
				50	}
				51
				52	func (p *decoder) unmarshalList() (Value, error) {
				53	b := p.in
				54	var elems []Value
				55	if err := p.consumeChar('[', "at start of list"); err != nil {
				56	return Value{}, err
				57	}
				58	if len(p.in) > 0 && p.in[0] != ']' {
				59	for len(p.in) > 0 {
				60	v, err := p.unmarshalValue()
				61	if !p.nerr.Merge(err) {
				62	return Value{}, err
				63	}
				64	elems = append(elems, v)
				65	if !p.tryConsumeChar(',') {
				66	break
				67	}
				68	}
				69	}
				70	if err := p.consumeChar(']', "at end of list"); err != nil {
				71	return Value{}, err
				72	}
				73	b = b[:len(b)-len(p.in)]
				74	return rawValueOf(elems, b[:len(b):len(b)]), nil
				75	}
				76
				77	func (p *decoder) unmarshalMessage(checkDelims bool) (Value, error) {
				78	b := p.in
				79	var items [][2]Value
				80	delims := [2]byte{'{', '}'}
				81	if len(p.in) > 0 && p.in[0] == '<' {
				82	delims = [2]byte{'<', '>'}
				83	}
				84	if checkDelims {
				85	if err := p.consumeChar(delims[0], "at start of message"); err != nil {
				86	return Value{}, err
				87	}
				88	}
				89	for len(p.in) > 0 {
				90	if p.in[0] == '}' \|\| p.in[0] == '>' {
				91	break
				92	}
				93	k, err := p.unmarshalKey()
				94	if !p.nerr.Merge(err) {
				95	return Value{}, err
				96	}
				97	if !p.tryConsumeChar(':') && len(p.in) > 0 && p.in[0] != '{' && p.in[0] != '<' {
				98	return Value{}, newSyntaxError("expected ':' after message key")
				99	}
				100	v, err := p.unmarshalValue()
				101	if !p.nerr.Merge(err) {
				102	return Value{}, err
				103	}
				104	if p.tryConsumeChar(';') \|\| p.tryConsumeChar(',') {
				105	// always optional
				106	}
				107	items = append(items, [2]Value{k, v})
				108	}
				109	if checkDelims {
				110	if err := p.consumeChar(delims[1], "at end of message"); err != nil {
				111	return Value{}, err
				112	}
				113	}
				114	b = b[:len(b)-len(p.in)]
				115	return rawValueOf(items, b[:len(b):len(b)]), nil
				116	}
				117
				118	// This expression is more liberal than ConsumeAnyTypeUrl in C++.
				119	// However, the C++ parser does not handle many legal URL strings.
				120	// The Go implementation is more liberal to be backwards compatible with
				121	// the historical Go implementation which was overly liberal (and buggy).
				122	var urlRegexp = regexp.MustCompile(`^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`)
				123
				124	// unmarshalKey parses the key, which may be a Name, String, or Uint.
				125	func (p *decoder) unmarshalKey() (v Value, err error) {
				126	if p.tryConsumeChar('[') {
				127	if len(p.in) == 0 {
				128	return Value{}, io.ErrUnexpectedEOF
				129	}
				130	if p.in[0] == '\'' \|\| p.in[0] == '"' {
				131	// Historically, Go's parser allowed a string for the Any type URL.
				132	// This is specific to Go and contrary to the C++ implementation,
				133	// which does not support strings for the Any type URL.
				134	v, err = p.unmarshalString()
				135	if !p.nerr.Merge(err) {
				136	return Value{}, err
				137	}
				138	} else if n := matchWithDelim(urlRegexp, p.in); n > 0 {
				139	v = rawValueOf(string(p.in[:n]), p.in[:n:n])
				140	p.consume(n)
				141	} else {
				142	return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
				143	}
				144	if err := p.consumeChar(']', "at end of extension name"); err != nil {
				145	return Value{}, err
				146	}
				147	return v, nil
				148	}
				149	if matchWithDelim(intRegexp, p.in) > 0 && p.in[0] != '-' {
				150	return p.unmarshalNumber()
				151	}
				152	return p.unmarshalName()
				153	}
				154
				155	func (p *decoder) unmarshalValue() (Value, error) {
				156	if len(p.in) == 0 {
				157	return Value{}, io.ErrUnexpectedEOF
				158	}
				159	switch p.in[0] {
				160	case '"', '\'':
				161	return p.unmarshalStrings()
				162	case '[':
				163	return p.unmarshalList()
				164	case '{', '<':
				165	return p.unmarshalMessage(true)
				166	default:
				167	n := matchWithDelim(nameRegexp, p.in) // zero if no match
				168	if n > 0 && literals[string(p.in[:n])] == nil {
				169	return p.unmarshalName()
				170	}
				171	return p.unmarshalNumber()
				172	}
				173	}
				174
				175	// This expression matches all valid proto identifiers.
				176	var nameRegexp = regexp.MustCompile(`^[_a-zA-Z][_a-zA-Z0-9]*`)
				177
				178	// unmarshalName unmarshals an unquoted identifier.
				179	//
				180	// E.g., `field_name` => ValueOf(protoreflect.Name("field_name"))
				181	func (p *decoder) unmarshalName() (Value, error) {
				182	if n := matchWithDelim(nameRegexp, p.in); n > 0 {
				183	v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
				184	p.consume(n)
				185	return v, nil
				186	}
				187	return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
				188	}
				189
				190	func (p *decoder) consumeChar(c byte, msg string) error {
				191	if p.tryConsumeChar(c) {
				192	return nil
				193	}
				194	if len(p.in) == 0 {
				195	return io.ErrUnexpectedEOF
				196	}
				197	return newSyntaxError("invalid character %q, expected %q %s", p.in[0], c, msg)
				198	}
				199
				200	func (p *decoder) tryConsumeChar(c byte) bool {
				201	if len(p.in) > 0 && p.in[0] == c {
				202	p.consume(1)
				203	return true
				204	}
				205	return false
				206	}
				207
				208	// consume consumes n bytes of input and any subsequent whitespace or comments.
				209	func (p *decoder) consume(n int) {
				210	p.in = p.in[n:]
				211	for len(p.in) > 0 {
				212	switch p.in[0] {
				213	case ' ', '\n', '\r', '\t':
				214	p.in = p.in[1:]
				215	case '#':
				216	if i := bytes.IndexByte(p.in, '\n'); i >= 0 {
				217	p.in = p.in[i+len("\n"):]
				218	} else {
				219	p.in = nil
				220	}
				221	default:
				222	return
				223	}
				224	}
				225	}
				226
				227	// Any sequence that looks like a non-delimiter (for error reporting).
				228	var errRegexp = regexp.MustCompile("^([-+._a-zA-Z0-9]{1,32}\|.)")
				229
				230	// matchWithDelim matches r with the input b and verifies that the match
				231	// terminates with a delimiter of some form (e.g., r"[^-+_.a-zA-Z0-9]").
				232	// As a special case, EOF is considered a delimiter.
				233	func matchWithDelim(r *regexp.Regexp, b []byte) int {
				234	n := len(r.Find(b))
				235	if n < len(b) {
				236	// Check that that the next character is a delimiter.
				237	c := b[n]
				238	notDelim := (c == '-' \|\| c == '+' \|\| c == '.' \|\| c == '_' \|\|
				239	('a' <= c && c <= 'z') \|\|
				240	('A' <= c && c <= 'Z') \|\|
				241	('0' <= c && c <= '9'))
				242	if notDelim {
				243	return 0
				244	}
				245	}
				246	return n
				247	}