syntax/quote.go - platform/external/starlark-go - Gitiles

 // Copyright 2017 The Bazel Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package syntax

 // Starlark quoted string utilities.

 import (
 	"fmt"
 	"strconv"
 	"strings"
 	"unicode"
 	"unicode/utf8"
 )

 // unesc is a lookup table indexed by the character that follows a
 // backslash in a one-character escape; the entry is the byte that the
 // escape denotes. Every other entry is zero.
 var unesc = [256]byte{
 	// Control characters, in order of the byte they produce.
 	'a': '\a',
 	'b': '\b',
 	't': '\t',
 	'n': '\n',
 	'v': '\v',
 	'f': '\f',
 	'r': '\r',

 	// Characters that stand for themselves.
 	'"':  '"',
 	'\'': '\'',
 	'\\': '\\',
 }

 // esc is a lookup table indexed by a byte that needs escaping; the
 // entry is the character to write after the backslash. Every other
 // entry is zero.
 var esc = [256]byte{
 	// Control characters, in order of their byte value.
 	'\a': 'a',
 	'\b': 'b',
 	'\t': 't',
 	'\n': 'n',
 	'\v': 'v',
 	'\f': 'f',
 	'\r': 'r',

 	// Characters that stand for themselves.
 	'"':  '"',
 	'\'': '\'',
 	'\\': '\\',
 }

 // unquote decodes the Starlark string or bytes literal in quoted.
 //
 // The literal may start with an "r" (raw) prefix and/or a "b" (bytes)
 // prefix, in that order, and must be delimited by matching single or
 // double quotes, optionally tripled. Carriage returns in the literal
 // text (alone or followed by a newline) become a single newline, even
 // in raw mode; backslash escapes are interpreted unless the literal
 // is raw.
 //
 // It returns the decoded value s, whether the literal was
 // triple-quoted, whether it had the bytes prefix, and an error
 // describing the first invalid construct found. s is empty whenever
 // err is non-nil.
 func unquote(quoted string) (s string, triple, isByte bool, err error) {
 	// Check for raw prefix: means don't interpret the inner \.
 	raw := false
 	if strings.HasPrefix(quoted, "r") {
 		raw = true
 		quoted = quoted[1:]
 	}
 	// Check for bytes prefix.
 	if strings.HasPrefix(quoted, "b") {
 		isByte = true
 		quoted = quoted[1:]
 	}

 	if len(quoted) < 2 {
 		err = fmt.Errorf("string literal too short")
 		return
 	}

 	// The literal must open with a quote character and close with
 	// the same one.
 	if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
 		err = fmt.Errorf("string literal has invalid quotes")
 		return
 	}

 	// Check for triple quoted string.
 	quote := quoted[0]
 	if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
 		triple = true
 		quoted = quoted[3 : len(quoted)-3]
 	} else {
 		quoted = quoted[1 : len(quoted)-1]
 	}

 	// Now quoted is the quoted data, but no quotes.
 	// Carriage returns are always rewritten; backslashes are
 	// interpreted only when the literal is not raw. If none of the
 	// relevant characters occur, the text is already the value.
 	unquoteChars := "\\\r"
 	if raw {
 		unquoteChars = "\r"
 	}
 	if !strings.ContainsAny(quoted, unquoteChars) {
 		s = quoted
 		return
 	}

 	// Otherwise process quoted string.
 	// Each iteration processes one escape sequence (or carriage
 	// return) along with the plain text leading up to it.
 	var buf strings.Builder
 	for {
 		// Copy the plain text before the next special character.
 		i := strings.IndexAny(quoted, unquoteChars)
 		if i < 0 {
 			i = len(quoted)
 		}
 		buf.WriteString(quoted[:i])
 		quoted = quoted[i:]

 		if len(quoted) == 0 {
 			break
 		}

 		// Process carriage return: CR and CR LF both become LF.
 		if quoted[0] == '\r' {
 			buf.WriteByte('\n')
 			if len(quoted) > 1 && quoted[1] == '\n' {
 				quoted = quoted[2:]
 			} else {
 				quoted = quoted[1:]
 			}
 			continue
 		}

 		// Process escape sequence; quoted[0] is the backslash.
 		if len(quoted) == 1 {
 			err = fmt.Errorf(`truncated escape sequence \`)
 			return
 		}

 		switch quoted[1] {
 		default:
 			// In Starlark, like Go, a backslash must escape something.
 			// (Python still treats unnecessary backslashes literally,
 			// but since 3.6 has emitted a deprecation warning.)
 			//
 			// Decode the whole character after the backslash so that
 			// a non-ASCII character is reported intact rather than
 			// as its first UTF-8 byte.
 			r, _ := utf8.DecodeRuneInString(quoted[1:])
 			err = fmt.Errorf("invalid escape sequence \\%c", r)
 			return

 		case '\n':
 			// Ignore the escape and the line break.
 			quoted = quoted[2:]

 		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
 			// One-char escape.
 			// Escapes are allowed for both kinds of quotation
 			// mark, not just the kind in use.
 			buf.WriteByte(unesc[quoted[1]])
 			quoted = quoted[2:]

 		case '0', '1', '2', '3', '4', '5', '6', '7':
 			// Octal escape, up to 3 digits, \OOO.
 			n := int(quoted[1] - '0')
 			quoted = quoted[2:]
 			for digits := 1; digits < 3; digits++ {
 				if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
 					break
 				}
 				n = n*8 + int(quoted[0]-'0')
 				quoted = quoted[1:]
 			}
 			if !isByte && n > 127 {
 				err = fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
 				return
 			}
 			if n >= 256 {
 				// NOTE: Python silently discards the high bit,
 				// so that '\541' == '\141' == 'a'.
 				// Let's see if we can avoid doing that in BUILD files.
 				err = fmt.Errorf(`invalid escape sequence \%03o`, n)
 				return
 			}
 			buf.WriteByte(byte(n))

 		case 'x':
 			// Hexadecimal escape, exactly 2 digits, \xXX.
 			// Values above 127 are accepted only in bytes literals.
 			if len(quoted) < 4 {
 				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
 				return
 			}
 			n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
 			if err1 != nil {
 				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
 				return
 			}
 			if !isByte && n > 127 {
 				err = fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
 					quoted[:4], n, n)
 				return
 			}
 			buf.WriteByte(byte(n))
 			quoted = quoted[4:]

 		case 'u', 'U':
 			// Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
 			// sz is the length of the whole escape, including "\u" or "\U".
 			sz := 6
 			if quoted[1] == 'U' {
 				sz = 10
 			}
 			if len(quoted) < sz {
 				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
 				return
 			}
 			n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
 			if err1 != nil {
 				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
 				return
 			}
 			if n > unicode.MaxRune {
 				// Report the actual upper bound, not the offending value.
 				err = fmt.Errorf(`code point out of range: %s (max \U%08x)`,
 					quoted[:sz], unicode.MaxRune)
 				return
 			}
 			// As in Go, surrogates are disallowed.
 			if 0xD800 <= n && n < 0xE000 {
 				err = fmt.Errorf(`invalid Unicode code point U+%04X`, n)
 				return
 			}
 			buf.WriteRune(rune(n))
 			quoted = quoted[sz:]
 		}
 	}

 	s = buf.String()
 	return
 }

 // indexByte returns the index of the first instance of b in s, or else -1.
 // It is a thin wrapper around strings.IndexByte.
 func indexByte(s string, b byte) int {
 	return strings.IndexByte(s, b)
 }

 // Quote returns a double-quoted Starlark literal that denotes s.
 // If b is true, the result is a bytes literal (prefixed with 'b').
 //
 // Printable runes are copied through unchanged. Double quotes and
 // backslashes are backslash-escaped, control characters use their
 // single-letter escape where one exists and \xXX otherwise, and other
 // non-printable runes are written as \uXXXX or \UXXXXXXXX. A byte
 // that is not part of a valid UTF-8 encoding is written as \xXX.
 func Quote(s string, b bool) string {
 	const hexDigits = "0123456789abcdef"
 	var enc [utf8.UTFMax]byte

 	out := make([]byte, 0, 3*len(s)/2)
 	if b {
 		out = append(out, 'b')
 	}
 	out = append(out, '"')

 	for len(s) > 0 {
 		// Decode the next rune; ASCII bytes take the fast path.
 		first := s[0]
 		r, size := rune(first), 1
 		if first >= utf8.RuneSelf {
 			r, size = utf8.DecodeRuneInString(s)
 		}
 		s = s[size:]

 		switch {
 		case size == 1 && r == utf8.RuneError:
 			// String (!b) literals accept \xXX escapes only for ASCII,
 			// but we must use them here to represent invalid bytes.
 			// The result is not a legal literal.
 			out = append(out, '\\', 'x', hexDigits[first>>4], hexDigits[first&0xF])

 		case r == '"' || r == '\\':
 			// Always backslashed.
 			out = append(out, '\\', byte(r))

 		case strconv.IsPrint(r):
 			out = append(out, enc[:utf8.EncodeRune(enc[:], r)]...)

 		case r == '\a':
 			out = append(out, '\\', 'a')
 		case r == '\b':
 			out = append(out, '\\', 'b')
 		case r == '\f':
 			out = append(out, '\\', 'f')
 		case r == '\n':
 			out = append(out, '\\', 'n')
 		case r == '\r':
 			out = append(out, '\\', 'r')
 		case r == '\t':
 			out = append(out, '\\', 't')
 		case r == '\v':
 			out = append(out, '\\', 'v')

 		case r < ' ' || r == 0x7f:
 			// Remaining ASCII control characters.
 			c := byte(r)
 			out = append(out, '\\', 'x', hexDigits[c>>4], hexDigits[c&0xF])

 		case r > utf8.MaxRune:
 			r = 0xFFFD
 			fallthrough
 		case r < 0x10000:
 			// Four hex digits, most significant first.
 			out = append(out, '\\', 'u')
 			for shift := 12; shift >= 0; shift -= 4 {
 				out = append(out, hexDigits[r>>uint(shift)&0xF])
 			}

 		default:
 			// Eight hex digits, most significant first.
 			out = append(out, '\\', 'U')
 			for shift := 28; shift >= 0; shift -= 4 {
 				out = append(out, hexDigits[r>>uint(shift)&0xF])
 			}
 		}
 	}

 	out = append(out, '"')
 	return string(out)
 }
	// Copyright 2017 The Bazel Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package syntax

	// Starlark quoted string utilities.

	import (
	"fmt"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
	)

	// unesc is a lookup table indexed by the character that follows a
	// backslash in a one-character escape; the entry is the byte that the
	// escape denotes. Every other entry is zero.
	var unesc = [256]byte{
		// Control characters, in order of the byte they produce.
		'a': '\a',
		'b': '\b',
		't': '\t',
		'n': '\n',
		'v': '\v',
		'f': '\f',
		'r': '\r',

		// Characters that stand for themselves.
		'"':  '"',
		'\'': '\'',
		'\\': '\\',
	}

	// esc is a lookup table indexed by a byte that needs escaping; the
	// entry is the character to write after the backslash. Every other
	// entry is zero.
	var esc = [256]byte{
		// Control characters, in order of their byte value.
		'\a': 'a',
		'\b': 'b',
		'\t': 't',
		'\n': 'n',
		'\v': 'v',
		'\f': 'f',
		'\r': 'r',

		// Characters that stand for themselves.
		'"':  '"',
		'\'': '\'',
		'\\': '\\',
	}

	// unquote unquotes the quoted string, returning the actual
	// string value, whether the original was triple-quoted,
	// whether it was a byte string, and an error describing invalid input.
	func unquote(quoted string) (s string, triple, isByte bool, err error) {
	// Check for raw prefix: means don't interpret the inner \.
	raw := false
	if strings.HasPrefix(quoted, "r") {
	raw = true
	quoted = quoted[1:]
	}
	// Check for bytes prefix.
	if strings.HasPrefix(quoted, "b") {
	isByte = true
	quoted = quoted[1:]
	}

	if len(quoted) < 2 {
	err = fmt.Errorf("string literal too short")
	return
	}

	if quoted[0] != '"' && quoted[0] != '\'' \|\| quoted[0] != quoted[len(quoted)-1] {
	err = fmt.Errorf("string literal has invalid quotes")
	return
	}

	// Check for triple quoted string.
	quote := quoted[0]
	if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
	triple = true
	quoted = quoted[3 : len(quoted)-3]
	} else {
	quoted = quoted[1 : len(quoted)-1]
	}

	// Now quoted is the quoted data, but no quotes.
	// If we're in raw mode or there are no escapes or
	// carriage returns, we're done.
	var unquoteChars string
	if raw {
	unquoteChars = "\r"
	} else {
	unquoteChars = "\\\r"
	}
	if !strings.ContainsAny(quoted, unquoteChars) {
	s = quoted
	return
	}

	// Otherwise process quoted string.
	// Each iteration processes one escape sequence along with the
	// plain text leading up to it.
	buf := new(strings.Builder)
	for {
	// Remove prefix before escape sequence.
	i := strings.IndexAny(quoted, unquoteChars)
	if i < 0 {
	i = len(quoted)
	}
	buf.WriteString(quoted[:i])
	quoted = quoted[i:]

	if len(quoted) == 0 {
	break
	}

	// Process carriage return.
	if quoted[0] == '\r' {
	buf.WriteByte('\n')
	if len(quoted) > 1 && quoted[1] == '\n' {
	quoted = quoted[2:]
	} else {
	quoted = quoted[1:]
	}
	continue
	}

	// Process escape sequence.
	if len(quoted) == 1 {
	err = fmt.Errorf(`truncated escape sequence \`)
	return
	}

	switch quoted[1] {
	default:
	// In Starlark, like Go, a backslash must escape something.
	// (Python still treats unnecessary backslashes literally,
	// but since 3.6 has emitted a deprecation warning.)
	err = fmt.Errorf("invalid escape sequence \\%c", quoted[1])
	return

	case '\n':
	// Ignore the escape and the line break.
	quoted = quoted[2:]

	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
	// One-char escape.
	// Escapes are allowed for both kinds of quotation
	// mark, not just the kind in use.
	buf.WriteByte(unesc[quoted[1]])
	quoted = quoted[2:]

	case '0', '1', '2', '3', '4', '5', '6', '7':
	// Octal escape, up to 3 digits, \OOO.
	n := int(quoted[1] - '0')
	quoted = quoted[2:]
	for i := 1; i < 3; i++ {
	if len(quoted) == 0 \|\| quoted[0] < '0' \|\| '7' < quoted[0] {
	break
	}
	n = n*8 + int(quoted[0]-'0')
	quoted = quoted[1:]
	}
	if !isByte && n > 127 {
	err = fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
	return
	}
	if n >= 256 {
	// NOTE: Python silently discards the high bit,
	// so that '\541' == '\141' == 'a'.
	// Let's see if we can avoid doing that in BUILD files.
	err = fmt.Errorf(`invalid escape sequence \%03o`, n)
	return
	}
	buf.WriteByte(byte(n))

	case 'x':
	// Hexadecimal escape, exactly 2 digits, \xXX. [0-127]
	if len(quoted) < 4 {
	err = fmt.Errorf(`truncated escape sequence %s`, quoted)
	return
	}
	n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
	if err1 != nil {
	err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
	return
	}
	if !isByte && n > 127 {
	err = fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
	quoted[:4], n, n)
	return
	}
	buf.WriteByte(byte(n))
	quoted = quoted[4:]

	case 'u', 'U':
	// Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
	sz := 6
	if quoted[1] == 'U' {
	sz = 10
	}
	if len(quoted) < sz {
	err = fmt.Errorf(`truncated escape sequence %s`, quoted)
	return
	}
	n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
	if err1 != nil {
	err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
	return
	}
	if n > unicode.MaxRune {
	err = fmt.Errorf(`code point out of range: %s (max \U%08x)`,
	quoted[:sz], n)
	return
	}
	// As in Go, surrogates are disallowed.
	if 0xD800 <= n && n < 0xE000 {
	err = fmt.Errorf(`invalid Unicode code point U+%04X`, n)
	return
	}
	buf.WriteRune(rune(n))
	quoted = quoted[sz:]
	}
	}

	s = buf.String()
	return
	}

	// indexByte returns the index of the first instance of b in s, or else -1.
	// It is a thin wrapper around strings.IndexByte.
	func indexByte(s string, b byte) int {
		return strings.IndexByte(s, b)
	}

	// Quote returns a double-quoted Starlark literal that denotes s.
	// If b is true, the result is a bytes literal (prefixed with 'b').
	//
	// Printable runes are copied through unchanged. Double quotes and
	// backslashes are backslash-escaped, control characters use their
	// single-letter escape where one exists and \xXX otherwise, and other
	// non-printable runes are written as \uXXXX or \UXXXXXXXX. A byte
	// that is not part of a valid UTF-8 encoding is written as \xXX.
	func Quote(s string, b bool) string {
		const hexDigits = "0123456789abcdef"
		var enc [utf8.UTFMax]byte

		out := make([]byte, 0, 3*len(s)/2)
		if b {
			out = append(out, 'b')
		}
		out = append(out, '"')

		for len(s) > 0 {
			// Decode the next rune; ASCII bytes take the fast path.
			first := s[0]
			r, size := rune(first), 1
			if first >= utf8.RuneSelf {
				r, size = utf8.DecodeRuneInString(s)
			}
			s = s[size:]

			switch {
			case size == 1 && r == utf8.RuneError:
				// String (!b) literals accept \xXX escapes only for ASCII,
				// but we must use them here to represent invalid bytes.
				// The result is not a legal literal.
				out = append(out, '\\', 'x', hexDigits[first>>4], hexDigits[first&0xF])

			case r == '"' || r == '\\':
				// Always backslashed.
				out = append(out, '\\', byte(r))

			case strconv.IsPrint(r):
				out = append(out, enc[:utf8.EncodeRune(enc[:], r)]...)

			case r == '\a':
				out = append(out, '\\', 'a')
			case r == '\b':
				out = append(out, '\\', 'b')
			case r == '\f':
				out = append(out, '\\', 'f')
			case r == '\n':
				out = append(out, '\\', 'n')
			case r == '\r':
				out = append(out, '\\', 'r')
			case r == '\t':
				out = append(out, '\\', 't')
			case r == '\v':
				out = append(out, '\\', 'v')

			case r < ' ' || r == 0x7f:
				// Remaining ASCII control characters.
				c := byte(r)
				out = append(out, '\\', 'x', hexDigits[c>>4], hexDigits[c&0xF])

			case r > utf8.MaxRune:
				r = 0xFFFD
				fallthrough
			case r < 0x10000:
				// Four hex digits, most significant first.
				out = append(out, '\\', 'u')
				for shift := 12; shift >= 0; shift -= 4 {
					out = append(out, hexDigits[r>>uint(shift)&0xF])
				}

			default:
				// Eight hex digits, most significant first.
				out = append(out, '\\', 'U')
				for shift := 28; shift >= 0; shift -= 4 {
					out = append(out, hexDigits[r>>uint(shift)&0xF])
				}
			}
		}

		out = append(out, '"')
		return string(out)
	}