blob: 741e106ad7dfbc485b141d5e64dce104a5e013b1 [file] [log] [blame]
Alan Donovan312d1a52017-10-02 10:10:28 -04001// Copyright 2017 The Bazel Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package syntax
6
Alan Donovane3deafe2018-10-23 11:05:09 -04007// Starlark quoted string utilities.
Alan Donovan312d1a52017-10-02 10:10:28 -04008
9import (
Alan Donovan312d1a52017-10-02 10:10:28 -040010 "fmt"
11 "strconv"
12 "strings"
alandonovanebe61bd2021-02-12 16:57:32 -050013 "unicode"
14 "unicode/utf8"
Alan Donovan312d1a52017-10-02 10:10:28 -040015)
16
// unesc is indexed by the byte that follows a backslash in a
// one-character escape sequence and holds the byte that the escape
// denotes. Entries for all other bytes are zero.
var unesc = [256]byte{
	// Quotation marks and the backslash stand for themselves.
	'"':  '"',
	'\'': '\'',
	'\\': '\\',

	// Letters that denote control characters.
	'a': '\a',
	'b': '\b',
	'f': '\f',
	'n': '\n',
	'r': '\r',
	't': '\t',
	'v': '\v',
}
30
// esc is indexed by a byte that needs escaping and holds the character
// to write after the backslash. Entries for all other bytes are zero.
var esc = [256]byte{
	// Quotation marks and the backslash are escaped as themselves.
	'"':  '"',
	'\'': '\'',
	'\\': '\\',

	// Control characters that have a one-letter escape.
	'\a': 'a',
	'\b': 'b',
	'\f': 'f',
	'\n': 'n',
	'\r': 'r',
	'\t': 't',
	'\v': 'v',
}
44
Alan Donovan312d1a52017-10-02 10:10:28 -040045// unquote unquotes the quoted string, returning the actual
alandonovanebe61bd2021-02-12 16:57:32 -050046// string value, whether the original was triple-quoted,
47// whether it was a byte string, and an error describing invalid input.
48func unquote(quoted string) (s string, triple, isByte bool, err error) {
Alan Donovan312d1a52017-10-02 10:10:28 -040049 // Check for raw prefix: means don't interpret the inner \.
50 raw := false
51 if strings.HasPrefix(quoted, "r") {
52 raw = true
53 quoted = quoted[1:]
54 }
alandonovanebe61bd2021-02-12 16:57:32 -050055 // Check for bytes prefix.
56 if strings.HasPrefix(quoted, "b") {
57 isByte = true
58 quoted = quoted[1:]
59 }
Alan Donovan312d1a52017-10-02 10:10:28 -040060
61 if len(quoted) < 2 {
62 err = fmt.Errorf("string literal too short")
63 return
64 }
65
66 if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
67 err = fmt.Errorf("string literal has invalid quotes")
68 return
69 }
70
71 // Check for triple quoted string.
72 quote := quoted[0]
73 if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
74 triple = true
75 quoted = quoted[3 : len(quoted)-3]
76 } else {
77 quoted = quoted[1 : len(quoted)-1]
78 }
79
80 // Now quoted is the quoted data, but no quotes.
81 // If we're in raw mode or there are no escapes or
82 // carriage returns, we're done.
83 var unquoteChars string
84 if raw {
85 unquoteChars = "\r"
86 } else {
87 unquoteChars = "\\\r"
88 }
89 if !strings.ContainsAny(quoted, unquoteChars) {
90 s = quoted
91 return
92 }
93
94 // Otherwise process quoted string.
95 // Each iteration processes one escape sequence along with the
96 // plain text leading up to it.
Josh Bleecher Snyder8cb25c82019-03-01 14:24:35 -080097 buf := new(strings.Builder)
Alan Donovan312d1a52017-10-02 10:10:28 -040098 for {
99 // Remove prefix before escape sequence.
100 i := strings.IndexAny(quoted, unquoteChars)
101 if i < 0 {
102 i = len(quoted)
103 }
104 buf.WriteString(quoted[:i])
105 quoted = quoted[i:]
106
107 if len(quoted) == 0 {
108 break
109 }
110
111 // Process carriage return.
112 if quoted[0] == '\r' {
113 buf.WriteByte('\n')
114 if len(quoted) > 1 && quoted[1] == '\n' {
115 quoted = quoted[2:]
116 } else {
117 quoted = quoted[1:]
118 }
119 continue
120 }
121
122 // Process escape sequence.
123 if len(quoted) == 1 {
124 err = fmt.Errorf(`truncated escape sequence \`)
125 return
126 }
127
128 switch quoted[1] {
129 default:
alandonovan16e44b12020-03-26 10:23:16 -0400130 // In Starlark, like Go, a backslash must escape something.
131 // (Python still treats unnecessary backslashes literally,
132 // but since 3.6 has emitted a deprecation warning.)
133 err = fmt.Errorf("invalid escape sequence \\%c", quoted[1])
134 return
Alan Donovan312d1a52017-10-02 10:10:28 -0400135
136 case '\n':
137 // Ignore the escape and the line break.
138 quoted = quoted[2:]
139
alandonovan2319aeb2020-06-15 13:21:36 -0400140 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
alandonovan16e44b12020-03-26 10:23:16 -0400141 // One-char escape.
alandonovan2319aeb2020-06-15 13:21:36 -0400142 // Escapes are allowed for both kinds of quotation
143 // mark, not just the kind in use.
Alan Donovan312d1a52017-10-02 10:10:28 -0400144 buf.WriteByte(unesc[quoted[1]])
145 quoted = quoted[2:]
146
147 case '0', '1', '2', '3', '4', '5', '6', '7':
alandonovanebe61bd2021-02-12 16:57:32 -0500148 // Octal escape, up to 3 digits, \OOO.
Alan Donovan312d1a52017-10-02 10:10:28 -0400149 n := int(quoted[1] - '0')
150 quoted = quoted[2:]
151 for i := 1; i < 3; i++ {
152 if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
153 break
154 }
155 n = n*8 + int(quoted[0]-'0')
156 quoted = quoted[1:]
157 }
alandonovanebe61bd2021-02-12 16:57:32 -0500158 if !isByte && n > 127 {
159 err = fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
160 return
161 }
Alan Donovan312d1a52017-10-02 10:10:28 -0400162 if n >= 256 {
163 // NOTE: Python silently discards the high bit,
164 // so that '\541' == '\141' == 'a'.
165 // Let's see if we can avoid doing that in BUILD files.
166 err = fmt.Errorf(`invalid escape sequence \%03o`, n)
167 return
168 }
169 buf.WriteByte(byte(n))
170
171 case 'x':
alandonovanebe61bd2021-02-12 16:57:32 -0500172 // Hexadecimal escape, exactly 2 digits, \xXX. [0-127]
Alan Donovan312d1a52017-10-02 10:10:28 -0400173 if len(quoted) < 4 {
174 err = fmt.Errorf(`truncated escape sequence %s`, quoted)
175 return
176 }
Josh Bleecher Snyder74f6cac2019-01-02 18:36:24 -1000177 n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
Alan Donovan312d1a52017-10-02 10:10:28 -0400178 if err1 != nil {
179 err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
180 return
181 }
alandonovanebe61bd2021-02-12 16:57:32 -0500182 if !isByte && n > 127 {
183 err = fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
184 quoted[:4], n, n)
185 return
186 }
Alan Donovan312d1a52017-10-02 10:10:28 -0400187 buf.WriteByte(byte(n))
188 quoted = quoted[4:]
alandonovanebe61bd2021-02-12 16:57:32 -0500189
190 case 'u', 'U':
191 // Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
192 sz := 6
193 if quoted[1] == 'U' {
194 sz = 10
195 }
196 if len(quoted) < sz {
197 err = fmt.Errorf(`truncated escape sequence %s`, quoted)
198 return
199 }
200 n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
201 if err1 != nil {
202 err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
203 return
204 }
205 if n > unicode.MaxRune {
206 err = fmt.Errorf(`code point out of range: %s (max \U%08x)`,
207 quoted[:sz], n)
208 return
209 }
210 // As in Go, surrogates are disallowed.
211 if 0xD800 <= n && n < 0xE000 {
212 err = fmt.Errorf(`invalid Unicode code point U+%04X`, n)
213 return
214 }
215 buf.WriteRune(rune(n))
216 quoted = quoted[sz:]
Alan Donovan312d1a52017-10-02 10:10:28 -0400217 }
218 }
219
220 s = buf.String()
221 return
222}
223
// indexByte returns the index of the first instance of b in s, or else -1.
// It is a thin wrapper around strings.IndexByte.
func indexByte(s string, b byte) int {
	return strings.IndexByte(s, b)
}
233
// Quote returns a double-quoted Starlark literal that denotes s.
// If b is set, the literal carries the b prefix of a bytes literal.
//
// Printable runes are copied through unchanged, except that the
// double quote and backslash are always backslash-escaped.
// Non-printable runes are written using the shortest applicable
// escape: a one-letter escape, \xXX, \uXXXX, or \UXXXXXXXX.
// Bytes that are not part of a valid UTF-8 encoding are written as
// \xXX; string (!b) literals accept such escapes only for ASCII, so
// in that case the result is not a legal literal.
func Quote(s string, b bool) string {
	const digits = "0123456789abcdef"
	// Escape letters for the contiguous control bytes
	// '\a' (0x07) through '\r' (0x0D), in byte order.
	const ctrlLetters = "abtnvfr"

	var scratch [utf8.UTFMax]byte

	out := make([]byte, 0, 3*len(s)/2)
	if b {
		out = append(out, 'b')
	}
	out = append(out, '"')

	for len(s) > 0 {
		// Decode the next rune, remembering its first byte in
		// case the encoding turns out to be invalid.
		lead := s[0]
		c, size := rune(lead), 1
		if c >= utf8.RuneSelf {
			c, size = utf8.DecodeRuneInString(s)
		}
		s = s[size:]

		switch {
		case size == 1 && c == utf8.RuneError:
			// Invalid UTF-8: emit the raw byte.
			out = append(out, '\\', 'x', digits[lead>>4], digits[lead&0xF])

		case c == '"' || c == '\\':
			// Always backslashed.
			out = append(out, '\\', byte(c))

		case strconv.IsPrint(c):
			k := utf8.EncodeRune(scratch[:], c)
			out = append(out, scratch[:k]...)

		case '\a' <= c && c <= '\r':
			out = append(out, '\\', ctrlLetters[c-'\a'])

		case c < ' ' || c == 0x7f:
			out = append(out, '\\', 'x', digits[byte(c)>>4], digits[byte(c)&0xF])

		case c > utf8.MaxRune:
			c = 0xFFFD
			fallthrough
		case c < 0x10000:
			out = append(out, '\\', 'u')
			for shift := 12; shift >= 0; shift -= 4 {
				out = append(out, digits[c>>uint(shift)&0xF])
			}

		default:
			out = append(out, '\\', 'U')
			for shift := 28; shift >= 0; shift -= 4 {
				out = append(out, digits[c>>uint(shift)&0xF])
			}
		}
	}

	out = append(out, '"')
	return string(out)
}