blob: 6792e7a912e28877c0fb2060b62fb0bd63b43ed7 [file] [log] [blame]
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5package text
6
7import (
8 "bytes"
9 "io"
10 "math"
11 "math/bits"
12 "strconv"
13 "strings"
14 "unicode"
15 "unicode/utf16"
16 "unicode/utf8"
17
Damien Neile89e6242019-05-13 23:55:40 -070018 "google.golang.org/protobuf/internal/errors"
Joe Tsai27c2a762018-08-01 16:48:18 -070019)
20
21func (p *encoder) marshalString(v Value) error {
22 var err error
23 p.out, err = appendString(p.out, v, p.outputASCII)
24 return err
25}
26func appendString(out []byte, v Value, outputASCII bool) ([]byte, error) {
27 if v.Type() != String {
28 return nil, errors.New("invalid type %v, expected string", v.Type())
29 }
30 if len(v.raw) > 0 {
31 return append(out, v.raw...), nil
32 }
33 in := v.String()
34
35 out = append(out, '"')
Herbie Ong1e096912019-04-16 00:14:03 -070036 i := indexNeedEscapeInString(in)
Joe Tsai27c2a762018-08-01 16:48:18 -070037 in, out = in[i:], append(out, in[:i]...)
38 for len(in) > 0 {
39 switch r, n := utf8.DecodeRuneInString(in); {
40 case r == utf8.RuneError && n == 1:
41 // We do not report invalid UTF-8 because strings in the text format
42 // are used to represent both the proto string and bytes type.
43 r = rune(in[0])
44 fallthrough
45 case r < ' ' || r == '"' || r == '\\':
46 out = append(out, '\\')
47 switch r {
48 case '"', '\\':
49 out = append(out, byte(r))
50 case '\n':
51 out = append(out, 'n')
52 case '\r':
53 out = append(out, 'r')
54 case '\t':
55 out = append(out, 't')
56 default:
57 out = append(out, 'x')
58 out = append(out, "00"[1+(bits.Len32(uint32(r))-1)/4:]...)
59 out = strconv.AppendUint(out, uint64(r), 16)
60 }
61 in = in[n:]
62 case outputASCII && r >= utf8.RuneSelf:
63 out = append(out, '\\')
64 if r <= math.MaxUint16 {
65 out = append(out, 'u')
66 out = append(out, "0000"[1+(bits.Len32(uint32(r))-1)/4:]...)
67 out = strconv.AppendUint(out, uint64(r), 16)
68 } else {
69 out = append(out, 'U')
70 out = append(out, "00000000"[1+(bits.Len32(uint32(r))-1)/4:]...)
71 out = strconv.AppendUint(out, uint64(r), 16)
72 }
73 in = in[n:]
74 default:
Herbie Ong1e096912019-04-16 00:14:03 -070075 i := indexNeedEscapeInString(in[n:])
Joe Tsai27c2a762018-08-01 16:48:18 -070076 in, out = in[n+i:], append(out, in[:n+i]...)
77 }
78 }
79 out = append(out, '"')
80 return out, nil
81}
82
83func (p *decoder) unmarshalString() (Value, error) {
84 v, n, err := consumeString(p.in)
85 p.consume(n)
86 return v, err
87}
88func consumeString(in []byte) (Value, int, error) {
Joe Tsai27c2a762018-08-01 16:48:18 -070089 in0 := in
90 if len(in) == 0 {
91 return Value{}, 0, io.ErrUnexpectedEOF
92 }
93 quote := in[0]
94 if in[0] != '"' && in[0] != '\'' {
95 return Value{}, 0, newSyntaxError("invalid character %q at start of string", in[0])
96 }
97 in = in[1:]
Herbie Ong1e096912019-04-16 00:14:03 -070098 i := indexNeedEscapeInBytes(in)
Joe Tsai27c2a762018-08-01 16:48:18 -070099 in, out := in[i:], in[:i:i] // set cap to prevent mutations
100 for len(in) > 0 {
101 switch r, n := utf8.DecodeRune(in); {
102 case r == utf8.RuneError && n == 1:
Damien Neil8c86fc52019-06-19 09:28:29 -0700103 return Value{}, 0, newSyntaxError("invalid UTF-8 detected")
Joe Tsai27c2a762018-08-01 16:48:18 -0700104 case r == 0 || r == '\n':
105 return Value{}, 0, newSyntaxError("invalid character %q in string", r)
106 case r == rune(quote):
107 in = in[1:]
108 n := len(in0) - len(in)
109 v := rawValueOf(string(out), in0[:n:n])
Damien Neil8c86fc52019-06-19 09:28:29 -0700110 return v, n, nil
Joe Tsai27c2a762018-08-01 16:48:18 -0700111 case r == '\\':
112 if len(in) < 2 {
113 return Value{}, 0, io.ErrUnexpectedEOF
114 }
115 switch r := in[1]; r {
116 case '"', '\'', '\\', '?':
117 in, out = in[2:], append(out, r)
118 case 'a':
119 in, out = in[2:], append(out, '\a')
120 case 'b':
121 in, out = in[2:], append(out, '\b')
122 case 'n':
123 in, out = in[2:], append(out, '\n')
124 case 'r':
125 in, out = in[2:], append(out, '\r')
126 case 't':
127 in, out = in[2:], append(out, '\t')
128 case 'v':
129 in, out = in[2:], append(out, '\v')
130 case 'f':
131 in, out = in[2:], append(out, '\f')
132 case '0', '1', '2', '3', '4', '5', '6', '7':
133 // One, two, or three octal characters.
134 n := len(in[1:]) - len(bytes.TrimLeft(in[1:], "01234567"))
135 if n > 3 {
136 n = 3
137 }
138 v, err := strconv.ParseUint(string(in[1:1+n]), 8, 8)
139 if err != nil {
140 return Value{}, 0, newSyntaxError("invalid octal escape code %q in string", in[:1+n])
141 }
142 in, out = in[1+n:], append(out, byte(v))
143 case 'x':
144 // One or two hexadecimal characters.
145 n := len(in[2:]) - len(bytes.TrimLeft(in[2:], "0123456789abcdefABCDEF"))
146 if n > 2 {
147 n = 2
148 }
149 v, err := strconv.ParseUint(string(in[2:2+n]), 16, 8)
150 if err != nil {
151 return Value{}, 0, newSyntaxError("invalid hex escape code %q in string", in[:2+n])
152 }
153 in, out = in[2+n:], append(out, byte(v))
154 case 'u', 'U':
155 // Four or eight hexadecimal characters
156 n := 6
157 if r == 'U' {
158 n = 10
159 }
160 if len(in) < n {
161 return Value{}, 0, io.ErrUnexpectedEOF
162 }
163 v, err := strconv.ParseUint(string(in[2:n]), 16, 32)
164 if utf8.MaxRune < v || err != nil {
165 return Value{}, 0, newSyntaxError("invalid Unicode escape code %q in string", in[:n])
166 }
167 in = in[n:]
168
169 r := rune(v)
170 if utf16.IsSurrogate(r) {
171 if len(in) < 6 {
172 return Value{}, 0, io.ErrUnexpectedEOF
173 }
174 v, err := strconv.ParseUint(string(in[2:6]), 16, 16)
175 r = utf16.DecodeRune(r, rune(v))
176 if in[0] != '\\' || in[1] != 'u' || r == unicode.ReplacementChar || err != nil {
177 return Value{}, 0, newSyntaxError("invalid Unicode escape code %q in string", in[:6])
178 }
179 in = in[6:]
180 }
181 out = append(out, string(r)...)
182 default:
183 return Value{}, 0, newSyntaxError("invalid escape code %q in string", in[:2])
184 }
185 default:
Herbie Ong1e096912019-04-16 00:14:03 -0700186 i := indexNeedEscapeInBytes(in[n:])
Joe Tsai27c2a762018-08-01 16:48:18 -0700187 in, out = in[n+i:], append(out, in[:n+i]...)
188 }
189 }
190 return Value{}, 0, io.ErrUnexpectedEOF
191}
192
193// unmarshalStrings unmarshals multiple strings.
194// This differs from unmarshalString since the text format allows
195// multiple back-to-back string literals where they are semantically treated
196// as a single large string with all values concatenated.
197//
198// E.g., `"foo" "bar" "baz"` => ValueOf("foobarbaz")
199func (p *decoder) unmarshalStrings() (Value, error) {
200 // Note that the ending quote is sufficient to unambiguously mark the end
201 // of a string. Thus, the text grammar does not require intervening
202 // whitespace or control characters in-between strings.
203 // Thus, the following is valid:
204 // `"foo"'bar'"baz"` => ValueOf("foobarbaz")
205 b := p.in
206 var ss []string
207 for len(p.in) > 0 && (p.in[0] == '"' || p.in[0] == '\'') {
208 v, err := p.unmarshalString()
Damien Neil8c86fc52019-06-19 09:28:29 -0700209 if err != nil {
Joe Tsai27c2a762018-08-01 16:48:18 -0700210 return Value{}, err
211 }
212 ss = append(ss, v.String())
213 }
214 b = b[:len(b)-len(p.in)]
215 return rawValueOf(strings.Join(ss, ""), b[:len(b):len(b)]), nil
216}
217
// indexNeedEscapeInString reports the index of the first byte in s that
// needs escaping: a control character below ' ', a double or single quote,
// a backslash, or any non-ASCII byte. If no such byte exists, it returns
// len(s).
func indexNeedEscapeInString(s string) int {
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case c < ' ', c >= utf8.RuneSelf, c == '"', c == '\'', c == '\\':
			return i
		}
	}
	return len(s)
}
Herbie Ong1e096912019-04-16 00:14:03 -0700228
// indexNeedEscapeInBytes reports the index of the first byte in b that
// needs escaping: a control character below ' ', a double or single quote,
// a backslash, or any non-ASCII byte. If no such byte exists, it returns
// len(b).
//
// The scan is byte-wise rather than rune-wise: every byte at or above
// utf8.RuneSelf stops the scan immediately, so multi-byte sequences never
// need to be decoded, and this mirrors indexNeedEscapeInString exactly.
//
// TODO: Remove this duplicate function when https://golang.org/issue/31506 gets
// resolved.
func indexNeedEscapeInBytes(b []byte) int {
	for i, c := range b {
		if c < ' ' || c == '"' || c == '\'' || c == '\\' || c >= utf8.RuneSelf {
			return i
		}
	}
	return len(b)
}