Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 1 | // Copyright 2017 The Bazel Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package syntax |
| 6 | |
Alan Donovan | e3deafe | 2018-10-23 11:05:09 -0400 | [diff] [blame] | 7 | // Starlark quoted string utilities. |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 8 | |
| 9 | import ( |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 10 | "fmt" |
| 11 | "strconv" |
| 12 | "strings" |
alandonovan | ebe61bd | 2021-02-12 16:57:32 -0500 | [diff] [blame] | 13 | "unicode" |
| 14 | "unicode/utf8" |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 15 | ) |
| 16 | |
// unesc maps single-letter chars following \ to their actual values.
// Bytes that do not name a one-character escape map to zero.
var unesc = [256]byte{
	'a':  '\a',
	'b':  '\b',
	'f':  '\f',
	'n':  '\n',
	'r':  '\r',
	't':  '\t',
	'v':  '\v',
	'\\': '\\',
	'\'': '\'',
	'"':  '"',
}

// esc maps escape-worthy bytes to the char that should follow \.
// Bytes that need no one-character escape map to zero.
var esc = [256]byte{
	'\a': 'a',
	'\b': 'b',
	'\f': 'f',
	'\n': 'n',
	'\r': 'r',
	'\t': 't',
	'\v': 'v',
	'\\': '\\',
	'\'': '\'',
	'"':  '"',
}

// unquote unquotes the quoted string, returning the actual
// string value, whether the original was triple-quoted,
// whether it was a byte string, and an error describing invalid input.
//
// The literal may start with an optional "r" (raw) prefix followed by an
// optional "b" (bytes) prefix, and must be enclosed in matching single,
// double, or triple quotation marks.
//
// In every literal, a carriage return (optionally followed by a newline)
// is decoded as a single newline. In non-raw literals the escapes
// \a \b \f \n \r \t \v \\ \' \", backslash-newline, octal \OOO, hex \xXX,
// and Unicode \uXXXX / \UXXXXXXXX are interpreted; any other backslash
// sequence is an error. Octal and hex escapes above 127 are accepted only
// in bytes literals.
//
// When err is non-nil, s is empty; triple and isByte reflect whatever was
// recognized before the error was detected.
func unquote(quoted string) (s string, triple, isByte bool, err error) {
	// Check for raw prefix: means don't interpret the inner \.
	raw := strings.HasPrefix(quoted, "r")
	if raw {
		quoted = quoted[1:]
	}
	// Check for bytes prefix (it follows the raw prefix when both are present).
	if strings.HasPrefix(quoted, "b") {
		isByte = true
		quoted = quoted[1:]
	}

	if len(quoted) < 2 {
		return "", triple, isByte, fmt.Errorf("string literal too short")
	}

	quote := quoted[0]
	if quote != '"' && quote != '\'' || quote != quoted[len(quoted)-1] {
		return "", triple, isByte, fmt.Errorf("string literal has invalid quotes")
	}

	// Check for triple quoted string, then strip the delimiters.
	if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
		triple = true
		quoted = quoted[3 : len(quoted)-3]
	} else {
		quoted = quoted[1 : len(quoted)-1]
	}

	// Now quoted is the quoted data, but no quotes.
	// If we're in raw mode or there are no escapes or
	// carriage returns, we're done.
	special := "\\\r"
	if raw {
		// Raw literals keep backslashes verbatim; only CR needs rewriting.
		special = "\r"
	}
	if !strings.ContainsAny(quoted, special) {
		return quoted, triple, isByte, nil
	}

	// Otherwise process quoted string.
	// Each iteration processes one escape sequence (or carriage return)
	// along with the plain text leading up to it.
	var buf strings.Builder
	for {
		// Copy the plain prefix before the next special character.
		i := strings.IndexAny(quoted, special)
		if i < 0 {
			i = len(quoted)
		}
		buf.WriteString(quoted[:i])
		quoted = quoted[i:]

		if len(quoted) == 0 {
			break
		}

		// Process carriage return: CR and CR LF both become LF.
		if quoted[0] == '\r' {
			buf.WriteByte('\n')
			if len(quoted) > 1 && quoted[1] == '\n' {
				quoted = quoted[2:]
			} else {
				quoted = quoted[1:]
			}
			continue
		}

		// Process escape sequence; quoted[0] is the backslash.
		if len(quoted) == 1 {
			return "", triple, isByte, fmt.Errorf(`truncated escape sequence \`)
		}

		switch quoted[1] {
		default:
			// In Starlark, like Go, a backslash must escape something.
			// (Python still treats unnecessary backslashes literally,
			// but since 3.6 has emitted a deprecation warning.)
			//
			// Decode the whole character after the backslash so that a
			// non-ASCII character is reported intact rather than as its
			// first UTF-8 byte.
			r, _ := utf8.DecodeRuneInString(quoted[1:])
			return "", triple, isByte, fmt.Errorf("invalid escape sequence \\%c", r)

		case '\n':
			// Ignore the escape and the line break.
			quoted = quoted[2:]

		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
			// One-char escape.
			// Escapes are allowed for both kinds of quotation
			// mark, not just the kind in use.
			buf.WriteByte(unesc[quoted[1]])
			quoted = quoted[2:]

		case '0', '1', '2', '3', '4', '5', '6', '7':
			// Octal escape, up to 3 digits, \OOO.
			n := int(quoted[1] - '0')
			quoted = quoted[2:]
			for digits := 1; digits < 3; digits++ {
				if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
					break
				}
				n = n*8 + int(quoted[0]-'0')
				quoted = quoted[1:]
			}
			if !isByte && n > 127 {
				return "", triple, isByte, fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
			}
			if n >= 256 {
				// NOTE: Python silently discards the high bit,
				// so that '\541' == '\141' == 'a'.
				// Let's see if we can avoid doing that in BUILD files.
				return "", triple, isByte, fmt.Errorf(`invalid escape sequence \%03o`, n)
			}
			buf.WriteByte(byte(n))

		case 'x':
			// Hexadecimal escape, exactly 2 digits, \xXX. [0-127]
			if len(quoted) < 4 {
				return "", triple, isByte, fmt.Errorf(`truncated escape sequence %s`, quoted)
			}
			n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
			if err1 != nil {
				return "", triple, isByte, fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
			}
			if !isByte && n > 127 {
				return "", triple, isByte, fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
					quoted[:4], n, n)
			}
			buf.WriteByte(byte(n))
			quoted = quoted[4:]

		case 'u', 'U':
			// Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
			// sz is the length of the whole escape including the "\u" or "\U".
			sz := 6
			if quoted[1] == 'U' {
				sz = 10
			}
			if len(quoted) < sz {
				return "", triple, isByte, fmt.Errorf(`truncated escape sequence %s`, quoted)
			}
			n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
			if err1 != nil {
				return "", triple, isByte, fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
			}
			if n > unicode.MaxRune {
				// Report the actual upper bound, not the offending value.
				return "", triple, isByte, fmt.Errorf(`code point out of range: %s (max \U%08x)`,
					quoted[:sz], unicode.MaxRune)
			}
			// As in Go, surrogates are disallowed.
			if 0xD800 <= n && n < 0xE000 {
				return "", triple, isByte, fmt.Errorf(`invalid Unicode code point U+%04X`, n)
			}
			buf.WriteRune(rune(n))
			quoted = quoted[sz:]
		}
	}

	return buf.String(), triple, isByte, nil
}
| 223 | |
// indexByte returns the index of the first instance of b in s, or else -1.
// It is a thin wrapper over strings.IndexByte, kept so existing callers
// continue to compile.
func indexByte(s string, b byte) int {
	return strings.IndexByte(s, b)
}
| 233 | |
// Quote returns a Starlark literal that denotes s.
// If b, it returns a bytes literal.
//
// The result is always delimited by double quotes. Double quotes and
// backslashes are backslash-escaped, printable characters (per
// strconv.IsPrint) are emitted as-is, and everything else is written
// using the shortest applicable escape: a one-letter escape, \xXX for
// other ASCII control characters, \uXXXX for code points below U+10000,
// and \UXXXXXXXX otherwise.
//
// Bytes of s that are not part of a valid UTF-8 encoding are written as
// \xXX escapes. For string (!b) literals this means the result is not a
// legal literal, since those accept \xXX escapes only for ASCII.
func Quote(s string, b bool) string {
	const hex = "0123456789abcdef"

	buf := make([]byte, 0, 3*len(s)/2)
	if b {
		buf = append(buf, 'b')
	}
	buf = append(buf, '"')
	for len(s) > 0 {
		// Decode the next character; ASCII takes the fast path.
		r, width := rune(s[0]), 1
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRuneInString(s)
		}

		switch {
		case width == 1 && r == utf8.RuneError:
			// String (!b) literals accept \xXX escapes only for ASCII,
			// but we must use them here to represent invalid bytes.
			// The result is not a legal literal.
			buf = append(buf, '\\', 'x', hex[s[0]>>4], hex[s[0]&0xF])

		case r == '"' || r == '\\': // always backslashed
			buf = append(buf, '\\', byte(r))

		case strconv.IsPrint(r):
			// r was decoded from valid UTF-8, so the source bytes are
			// exactly its encoding and can be copied through unchanged.
			buf = append(buf, s[:width]...)

		case r == '\a':
			buf = append(buf, `\a`...)
		case r == '\b':
			buf = append(buf, `\b`...)
		case r == '\f':
			buf = append(buf, `\f`...)
		case r == '\n':
			buf = append(buf, `\n`...)
		case r == '\r':
			buf = append(buf, `\r`...)
		case r == '\t':
			buf = append(buf, `\t`...)
		case r == '\v':
			buf = append(buf, `\v`...)

		case r < ' ' || r == 0x7f:
			// Remaining ASCII control characters.
			buf = append(buf, '\\', 'x', hex[byte(r)>>4], hex[byte(r)&0xF])

		case r < 0x10000:
			// Non-printable code point that fits in four hex digits.
			buf = append(buf, `\u`...)
			for shift := 12; shift >= 0; shift -= 4 {
				buf = append(buf, hex[r>>uint(shift)&0xF])
			}

		default:
			// Non-printable code point needing eight hex digits.
			buf = append(buf, `\U`...)
			for shift := 28; shift >= 0; shift -= 4 {
				buf = append(buf, hex[r>>uint(shift)&0xF])
			}
		}
		s = s[width:]
	}
	buf = append(buf, '"')
	return string(buf)
}