blob: 741e106ad7dfbc485b141d5e64dce104a5e013b1 [file] [log] [blame]
// Copyright 2017 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
// Starlark quoted string utilities.
import (
"fmt"
"strconv"
"strings"
"unicode"
"unicode/utf8"
)
// unesc maps single-letter chars following \ to their actual values.
var unesc = [256]byte{
'a': '\a',
'b': '\b',
'f': '\f',
'n': '\n',
'r': '\r',
't': '\t',
'v': '\v',
'\\': '\\',
'\'': '\'',
'"': '"',
}
// esc maps escape-worthy bytes to the char that should follow \.
var esc = [256]byte{
'\a': 'a',
'\b': 'b',
'\f': 'f',
'\n': 'n',
'\r': 'r',
'\t': 't',
'\v': 'v',
'\\': '\\',
'\'': '\'',
'"': '"',
}
// unquote unquotes the quoted string, returning the actual
// string value, whether the original was triple-quoted,
// whether it was a byte string, and an error describing invalid input.
func unquote(quoted string) (s string, triple, isByte bool, err error) {
// Check for raw prefix: means don't interpret the inner \.
raw := false
if strings.HasPrefix(quoted, "r") {
raw = true
quoted = quoted[1:]
}
// Check for bytes prefix.
if strings.HasPrefix(quoted, "b") {
isByte = true
quoted = quoted[1:]
}
if len(quoted) < 2 {
err = fmt.Errorf("string literal too short")
return
}
if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
err = fmt.Errorf("string literal has invalid quotes")
return
}
// Check for triple quoted string.
quote := quoted[0]
if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
triple = true
quoted = quoted[3 : len(quoted)-3]
} else {
quoted = quoted[1 : len(quoted)-1]
}
// Now quoted is the quoted data, but no quotes.
// If we're in raw mode or there are no escapes or
// carriage returns, we're done.
var unquoteChars string
if raw {
unquoteChars = "\r"
} else {
unquoteChars = "\\\r"
}
if !strings.ContainsAny(quoted, unquoteChars) {
s = quoted
return
}
// Otherwise process quoted string.
// Each iteration processes one escape sequence along with the
// plain text leading up to it.
buf := new(strings.Builder)
for {
// Remove prefix before escape sequence.
i := strings.IndexAny(quoted, unquoteChars)
if i < 0 {
i = len(quoted)
}
buf.WriteString(quoted[:i])
quoted = quoted[i:]
if len(quoted) == 0 {
break
}
// Process carriage return.
if quoted[0] == '\r' {
buf.WriteByte('\n')
if len(quoted) > 1 && quoted[1] == '\n' {
quoted = quoted[2:]
} else {
quoted = quoted[1:]
}
continue
}
// Process escape sequence.
if len(quoted) == 1 {
err = fmt.Errorf(`truncated escape sequence \`)
return
}
switch quoted[1] {
default:
// In Starlark, like Go, a backslash must escape something.
// (Python still treats unnecessary backslashes literally,
// but since 3.6 has emitted a deprecation warning.)
err = fmt.Errorf("invalid escape sequence \\%c", quoted[1])
return
case '\n':
// Ignore the escape and the line break.
quoted = quoted[2:]
case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
// One-char escape.
// Escapes are allowed for both kinds of quotation
// mark, not just the kind in use.
buf.WriteByte(unesc[quoted[1]])
quoted = quoted[2:]
case '0', '1', '2', '3', '4', '5', '6', '7':
// Octal escape, up to 3 digits, \OOO.
n := int(quoted[1] - '0')
quoted = quoted[2:]
for i := 1; i < 3; i++ {
if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
break
}
n = n*8 + int(quoted[0]-'0')
quoted = quoted[1:]
}
if !isByte && n > 127 {
err = fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
return
}
if n >= 256 {
// NOTE: Python silently discards the high bit,
// so that '\541' == '\141' == 'a'.
// Let's see if we can avoid doing that in BUILD files.
err = fmt.Errorf(`invalid escape sequence \%03o`, n)
return
}
buf.WriteByte(byte(n))
case 'x':
// Hexadecimal escape, exactly 2 digits, \xXX. [0-127]
if len(quoted) < 4 {
err = fmt.Errorf(`truncated escape sequence %s`, quoted)
return
}
n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
if err1 != nil {
err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
return
}
if !isByte && n > 127 {
err = fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
quoted[:4], n, n)
return
}
buf.WriteByte(byte(n))
quoted = quoted[4:]
case 'u', 'U':
// Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
sz := 6
if quoted[1] == 'U' {
sz = 10
}
if len(quoted) < sz {
err = fmt.Errorf(`truncated escape sequence %s`, quoted)
return
}
n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
if err1 != nil {
err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
return
}
if n > unicode.MaxRune {
err = fmt.Errorf(`code point out of range: %s (max \U%08x)`,
quoted[:sz], n)
return
}
// As in Go, surrogates are disallowed.
if 0xD800 <= n && n < 0xE000 {
err = fmt.Errorf(`invalid Unicode code point U+%04X`, n)
return
}
buf.WriteRune(rune(n))
quoted = quoted[sz:]
}
}
s = buf.String()
return
}
// indexByte returns the index of the first instance of b in s, or else -1.
func indexByte(s string, b byte) int {
for i := 0; i < len(s); i++ {
if s[i] == b {
return i
}
}
return -1
}
// Quote returns a Starlark literal that denotes s.
// If b, it returns a bytes literal.
func Quote(s string, b bool) string {
const hex = "0123456789abcdef"
var runeTmp [utf8.UTFMax]byte
buf := make([]byte, 0, 3*len(s)/2)
if b {
buf = append(buf, 'b')
}
buf = append(buf, '"')
for width := 0; len(s) > 0; s = s[width:] {
r := rune(s[0])
width = 1
if r >= utf8.RuneSelf {
r, width = utf8.DecodeRuneInString(s)
}
if width == 1 && r == utf8.RuneError {
// String (!b) literals accept \xXX escapes only for ASCII,
// but we must use them here to represent invalid bytes.
// The result is not a legal literal.
buf = append(buf, `\x`...)
buf = append(buf, hex[s[0]>>4])
buf = append(buf, hex[s[0]&0xF])
continue
}
if r == '"' || r == '\\' { // always backslashed
buf = append(buf, '\\')
buf = append(buf, byte(r))
continue
}
if strconv.IsPrint(r) {
n := utf8.EncodeRune(runeTmp[:], r)
buf = append(buf, runeTmp[:n]...)
continue
}
switch r {
case '\a':
buf = append(buf, `\a`...)
case '\b':
buf = append(buf, `\b`...)
case '\f':
buf = append(buf, `\f`...)
case '\n':
buf = append(buf, `\n`...)
case '\r':
buf = append(buf, `\r`...)
case '\t':
buf = append(buf, `\t`...)
case '\v':
buf = append(buf, `\v`...)
default:
switch {
case r < ' ' || r == 0x7f:
buf = append(buf, `\x`...)
buf = append(buf, hex[byte(r)>>4])
buf = append(buf, hex[byte(r)&0xF])
case r > utf8.MaxRune:
r = 0xFFFD
fallthrough
case r < 0x10000:
buf = append(buf, `\u`...)
for s := 12; s >= 0; s -= 4 {
buf = append(buf, hex[r>>uint(s)&0xF])
}
default:
buf = append(buf, `\U`...)
for s := 28; s >= 0; s -= 4 {
buf = append(buf, hex[r>>uint(s)&0xF])
}
}
}
}
buf = append(buf, '"')
return string(buf)
}