Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 1 | // Copyright 2017 The Bazel Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package syntax |
| 6 | |
| 7 | import ( |
| 8 | "bytes" |
| 9 | "fmt" |
| 10 | "go/build" |
| 11 | "io/ioutil" |
| 12 | "path/filepath" |
alandonovan | ebe61bd | 2021-02-12 16:57:32 -0500 | [diff] [blame] | 13 | "strings" |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 14 | "testing" |
| 15 | ) |
| 16 | |
| 17 | func scan(src interface{}) (tokens string, err error) { |
Alan Donovan | e3deafe | 2018-10-23 11:05:09 -0400 | [diff] [blame] | 18 | sc, err := newScanner("foo.star", src, false) |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 19 | if err != nil { |
| 20 | return "", err |
| 21 | } |
| 22 | |
| 23 | defer sc.recover(&err) |
| 24 | |
| 25 | var buf bytes.Buffer |
| 26 | var val tokenValue |
| 27 | for { |
| 28 | tok := sc.nextToken(&val) |
| 29 | |
| 30 | if buf.Len() > 0 { |
| 31 | buf.WriteByte(' ') |
| 32 | } |
| 33 | switch tok { |
| 34 | case EOF: |
| 35 | buf.WriteString("EOF") |
| 36 | case IDENT: |
| 37 | buf.WriteString(val.raw) |
| 38 | case INT: |
Mohamed Elqdusy | 69e9615 | 2018-01-22 20:00:29 +0100 | [diff] [blame] | 39 | if val.bigInt != nil { |
| 40 | fmt.Fprintf(&buf, "%d", val.bigInt) |
| 41 | } else { |
| 42 | fmt.Fprintf(&buf, "%d", val.int) |
| 43 | } |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 44 | case FLOAT: |
| 45 | fmt.Fprintf(&buf, "%e", val.float) |
alandonovan | ebe61bd | 2021-02-12 16:57:32 -0500 | [diff] [blame] | 46 | case STRING, BYTES: |
| 47 | buf.WriteString(Quote(val.string, tok == BYTES)) |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 48 | default: |
| 49 | buf.WriteString(tok.String()) |
| 50 | } |
| 51 | if tok == EOF { |
| 52 | break |
| 53 | } |
| 54 | } |
| 55 | return buf.String(), nil |
| 56 | } |
| 57 | |
// TestScanner runs scan over each input in the table below and reports a failure unless the resulting token string (or, when scan fails, the error text) begins with the expected string.
| 58 | func TestScanner(t *testing.T) { |
| 59 | for _, test := range []struct { |
| 60 | input, want string |
| 61 | }{ |
| 62 | {``, "EOF"}, |
| 63 | {`123`, "123 EOF"}, |
| 64 | {`x.y`, "x . y EOF"}, |
| 65 | {`chocolate.éclair`, `chocolate . éclair EOF`}, |
| 66 | {`123 "foo" hello x.y`, `123 "foo" hello x . y EOF`}, |
| 67 | {`print(x)`, "print ( x ) EOF"}, |
| 68 | {`print(x); print(y)`, "print ( x ) ; print ( y ) EOF"}, |
Alan Donovan | ae06384 | 2017-10-10 15:46:17 -0400 | [diff] [blame] | 69 | {"\nprint(\n1\n)\n", "print ( 1 ) newline EOF"}, // final \n is at toplevel on non-blank line => token |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 70 | {`/ // /= //= ///=`, "/ // /= //= // /= EOF"}, |
| 71 | {`# hello |
| 72 | print(x)`, "print ( x ) EOF"}, |
| 73 | {`# hello |
| 74 | print(1) |
| 75 | cc_binary(name="foo") |
| 76 | def f(x): |
| 77 | return x+1 |
| 78 | print(1) |
| 79 | `, |
| 80 | `print ( 1 ) newline ` + |
| 81 | `cc_binary ( name = "foo" ) newline ` + |
| 82 | `def f ( x ) : newline ` + |
| 83 | `indent return x + 1 newline ` + |
| 84 | `outdent print ( 1 ) newline ` + |
| 85 | `EOF`}, |
| 86 | // EOF should act like an implicit newline. |
| 87 | {`def f(): pass`, |
| 88 | "def f ( ) : pass EOF"}, |
| 89 | {`def f(): |
| 90 | pass`, |
| 91 | "def f ( ) : newline indent pass newline outdent EOF"}, |
| 92 | {`def f(): |
| 93 | pass |
| 94 | # oops`, |
| 95 | "def f ( ) : newline indent pass newline outdent EOF"}, |
| 96 | {`def f(): |
| 97 | pass \ |
| 98 | `, |
| 99 | "def f ( ) : newline indent pass newline outdent EOF"}, |
| 100 | {`def f(): |
| 101 | pass |
| 102 | `, |
| 103 | "def f ( ) : newline indent pass newline outdent EOF"}, |
| 104 | {`pass |
| 105 | |
| 106 | |
| 107 | pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated |
| 108 | {`def f(): |
| 109 | pass |
| 110 | `, "def f ( ) : newline indent pass newline outdent EOF"}, |
| 111 | {`def f(): |
| 112 | pass |
| 113 | ` + "\n", "def f ( ) : newline indent pass newline outdent EOF"}, |
| 114 | {"pass", "pass EOF"}, |
| 115 | {"pass\n", "pass newline EOF"}, |
| 116 | {"pass\n ", "pass newline EOF"}, |
| 117 | {"pass\n \n", "pass newline EOF"}, |
| 118 | {"if x:\n pass\n ", "if x : newline indent pass newline outdent EOF"}, |
| 119 | {`x = 1 + \ |
| 120 | 2`, `x = 1 + 2 EOF`}, |
| 121 | {`x = 'a\nb'`, `x = "a\nb" EOF`}, |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 122 | {`x = r'a\nb'`, `x = "a\\nb" EOF`}, |
alandonovan | e8819e8 | 2020-03-26 17:56:36 -0400 | [diff] [blame] | 123 | {"x = 'a\\\nb'", `x = "ab" EOF`}, |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 124 | {`x = '\''`, `x = "'" EOF`}, |
| 125 | {`x = "\""`, `x = "\"" EOF`}, |
| 126 | {`x = r'\''`, `x = "\\'" EOF`}, |
| 127 | {`x = '''\''''`, `x = "'" EOF`}, |
| 128 | {`x = r'''\''''`, `x = "\\'" EOF`}, |
| 129 | {`x = ''''a'b'c'''`, `x = "'a'b'c" EOF`}, |
| 130 | {"x = '''a\nb'''", `x = "a\nb" EOF`}, |
| 131 | {"x = '''a\rb'''", `x = "a\nb" EOF`}, |
| 132 | {"x = '''a\r\nb'''", `x = "a\nb" EOF`}, |
| 133 | {"x = '''a\n\rb'''", `x = "a\n\nb" EOF`}, |
| 134 | {"x = r'a\\\nb'", `x = "a\\\nb" EOF`}, |
| 135 | {"x = r'a\\\rb'", `x = "a\\\nb" EOF`}, |
| 136 | {"x = r'a\\\r\nb'", `x = "a\\\nb" EOF`}, |
| 137 | {"a\rb", `a newline b EOF`}, |
| 138 | {"a\nb", `a newline b EOF`}, |
| 139 | {"a\r\nb", `a newline b EOF`}, |
| 140 | {"a\n\nb", `a newline b EOF`}, |
| 141 | // numbers |
| 142 | {"0", `0 EOF`}, |
| 143 | {"00", `0 EOF`}, |
| 144 | {"0.", `0.000000e+00 EOF`}, |
| 145 | {"0.e1", `0.000000e+00 EOF`}, |
| 146 | {".0", `0.000000e+00 EOF`}, |
| 147 | {"0.0", `0.000000e+00 EOF`}, |
| 148 | {".e1", `. e1 EOF`}, |
| 149 | {"1", `1 EOF`}, |
| 150 | {"1.", `1.000000e+00 EOF`}, |
| 151 | {".1", `1.000000e-01 EOF`}, |
| 152 | {".1e1", `1.000000e+00 EOF`}, |
| 153 | {".1e+1", `1.000000e+00 EOF`}, |
| 154 | {".1e-1", `1.000000e-02 EOF`}, |
| 155 | {"1e1", `1.000000e+01 EOF`}, |
| 156 | {"1e+1", `1.000000e+01 EOF`}, |
| 157 | {"1e-1", `1.000000e-01 EOF`}, |
| 158 | {"123", `123 EOF`}, |
| 159 | {"123e45", `1.230000e+47 EOF`}, |
Mohamed Elqdusy | 69e9615 | 2018-01-22 20:00:29 +0100 | [diff] [blame] | 160 | {"999999999999999999999999999999999999999999999999999", `999999999999999999999999999999999999999999999999999 EOF`}, |
| 161 | {"12345678901234567890", `12345678901234567890 EOF`}, |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 162 | // hex |
| 163 | {"0xA", `10 EOF`}, |
| 164 | {"0xAAG", `170 G EOF`}, |
Alan Donovan | e3deafe | 2018-10-23 11:05:09 -0400 | [diff] [blame] | 165 | {"0xG", `foo.star:1:1: invalid hex literal`}, |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 166 | {"0XA", `10 EOF`}, |
Alan Donovan | e3deafe | 2018-10-23 11:05:09 -0400 | [diff] [blame] | 167 | {"0XG", `foo.star:1:1: invalid hex literal`}, |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 168 | {"0xA.", `10 . EOF`}, |
| 169 | {"0xA.e1", `10 . e1 EOF`}, |
Mohamed Elqdusy | 69e9615 | 2018-01-22 20:00:29 +0100 | [diff] [blame] | 170 | {"0x12345678deadbeef12345678", `5634002672576678570168178296 EOF`}, |
Mohamed Elqdusy | 3b32df9 | 2018-01-08 17:20:46 +0100 | [diff] [blame] | 171 | // binary |
| 172 | {"0b1010", `10 EOF`}, |
| 173 | {"0B111101", `61 EOF`}, |
Alan Donovan | e3deafe | 2018-10-23 11:05:09 -0400 | [diff] [blame] | 174 | {"0b3", `foo.star:1:3: invalid binary literal`}, |
Mohamed Elqdusy | 3b32df9 | 2018-01-08 17:20:46 +0100 | [diff] [blame] | 175 | {"0b1010201", `10 201 EOF`}, |
| 176 | {"0b1010.01", `10 1.000000e-02 EOF`}, |
| 177 | {"0b0000", `0 EOF`}, |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 178 | // octal |
| 179 | {"0o123", `83 EOF`}, |
| 180 | {"0o12834", `10 834 EOF`}, |
| 181 | {"0o12934", `10 934 EOF`}, |
| 182 | {"0o12934.", `10 9.340000e+02 EOF`}, |
| 183 | {"0o12934.1", `10 9.341000e+02 EOF`}, |
| 184 | {"0o12934e1", `10 9.340000e+03 EOF`}, |
| 185 | {"0o123.", `83 . EOF`}, |
| 186 | {"0o123.1", `83 1.000000e-01 EOF`}, |
alandonovan | a475931 | 2019-05-28 16:17:46 -0400 | [diff] [blame] | 187 | {"0123", `foo.star:1:5: obsolete form of octal literal; use 0o123`}, |
Alan Donovan | e3deafe | 2018-10-23 11:05:09 -0400 | [diff] [blame] | 188 | {"012834", `foo.star:1:1: invalid int literal`}, |
| 189 | {"012934", `foo.star:1:1: invalid int literal`}, |
| 190 | {"i = 012934", `foo.star:1:5: invalid int literal`}, |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 191 | // octal escapes in string literals |
| 192 | {`"\037"`, `"\x1f" EOF`}, |
alandonovan | ebe61bd | 2021-02-12 16:57:32 -0500 | [diff] [blame] | 193 | {`"\377"`, `foo.star:1:1: non-ASCII octal escape \377 (use \u00FF for the UTF-8 encoding of U+00FF)`}, |
| 194 | {`"\378"`, `"\x1f8" EOF`}, // = '\37' + '8' |
| 195 | {`"\400"`, `foo.star:1:1: non-ASCII octal escape \400`}, // unlike Python 2 and 3 |
| 196 | // hex escapes |
| 197 | {`"\x00\x20\x09\x41\x7e\x7f"`, `"\x00 \tA~\x7f" EOF`}, // DEL is non-printable |
| 198 | {`"\x80"`, `foo.star:1:1: non-ASCII hex escape`}, |
| 199 | {`"\xff"`, `foo.star:1:1: non-ASCII hex escape`}, |
| 200 | {`"\xFf"`, `foo.star:1:1: non-ASCII hex escape`}, |
| 201 | {`"\xF"`, `foo.star:1:1: truncated escape sequence \xF`}, |
| 202 | {`"\x"`, `foo.star:1:1: truncated escape sequence \x`}, |
| 203 | {`"\xfg"`, `foo.star:1:1: invalid escape sequence \xfg`}, |
| 204 | // Unicode escapes |
| 205 | // \uXXXX |
| 206 | {`"\u0400"`, `"Ѐ" EOF`}, |
| 207 | {`"\u100"`, `foo.star:1:1: truncated escape sequence \u100`}, |
| 208 | {`"\u04000"`, `"Ѐ0" EOF`}, // = U+0400 + '0' |
| 209 | {`"\u100g"`, `foo.star:1:1: invalid escape sequence \u100g`}, |
| 210 | {`"\u4E16"`, `"世" EOF`}, |
| 211 | {`"\udc00"`, `foo.star:1:1: invalid Unicode code point U+DC00`}, // surrogate |
| 212 | // \UXXXXXXXX |
| 213 | {`"\U00000400"`, `"Ѐ" EOF`}, |
| 214 | {`"\U0000400"`, `foo.star:1:1: truncated escape sequence \U0000400`}, |
| 215 | {`"\U000004000"`, `"Ѐ0" EOF`}, // = U+0400 + '0' |
| 216 | {`"\U1000000g"`, `foo.star:1:1: invalid escape sequence \U1000000g`}, |
| 217 | {`"\U0010FFFF"`, `"\U0010ffff" EOF`}, |
| 218 | {`"\U00110000"`, `foo.star:1:1: code point out of range: \U00110000 (max \U00110000)`}, |
| 219 | {`"\U0001F63F"`, `"😿" EOF`}, |
| 220 | {`"\U0000dc00"`, `foo.star:1:1: invalid Unicode code point U+DC00`}, // surrogate |
alandonovan | 16e44b1 | 2020-03-26 10:23:16 -0400 | [diff] [blame] | 221 | |
| 222 | // backslash escapes |
| 223 | // As in Go, a backslash must escape something. |
| 224 | // (Python started issuing a deprecation warning in 3.6.) |
| 225 | {`"foo\(bar"`, `foo.star:1:1: invalid escape sequence \(`}, |
| 226 | {`"\+"`, `foo.star:1:1: invalid escape sequence \+`}, |
| 227 | {`"\w"`, `foo.star:1:1: invalid escape sequence \w`}, |
| 228 | {`"\""`, `"\"" EOF`}, |
alandonovan | 2319aeb | 2020-06-15 13:21:36 -0400 | [diff] [blame] | 229 | {`"\'"`, `"'" EOF`}, |
alandonovan | 16e44b1 | 2020-03-26 10:23:16 -0400 | [diff] [blame] | 230 | {`'\w'`, `foo.star:1:1: invalid escape sequence \w`}, |
| 231 | {`'\''`, `"'" EOF`}, |
alandonovan | 2319aeb | 2020-06-15 13:21:36 -0400 | [diff] [blame] | 232 | {`'\"'`, `"\"" EOF`}, |
alandonovan | 16e44b1 | 2020-03-26 10:23:16 -0400 | [diff] [blame] | 233 | {`"""\w"""`, `foo.star:1:1: invalid escape sequence \w`}, |
| 234 | {`"""\""""`, `"\"" EOF`}, |
alandonovan | 2319aeb | 2020-06-15 13:21:36 -0400 | [diff] [blame] | 235 | {`"""\'"""`, `"'" EOF`}, |
alandonovan | 16e44b1 | 2020-03-26 10:23:16 -0400 | [diff] [blame] | 236 | {`'''\w'''`, `foo.star:1:1: invalid escape sequence \w`}, |
| 237 | {`'''\''''`, `"'" EOF`}, |
alandonovan | 2319aeb | 2020-06-15 13:21:36 -0400 | [diff] [blame] | 238 | {`'''\"'''`, `"\"" EOF`}, |
alandonovan | 16e44b1 | 2020-03-26 10:23:16 -0400 | [diff] [blame] | 239 | {`r"\w"`, `"\\w" EOF`}, |
| 240 | {`r"\""`, `"\\\"" EOF`}, |
| 241 | {`r"\'"`, `"\\'" EOF`}, |
| 242 | {`r'\w'`, `"\\w" EOF`}, |
| 243 | {`r'\''`, `"\\'" EOF`}, |
| 244 | {`r'\"'`, `"\\\"" EOF`}, |
| 245 | {`'a\zb'`, `foo.star:1:1: invalid escape sequence \z`}, |
| 246 | {`"\o123"`, `foo.star:1:1: invalid escape sequence \o`}, |
alandonovan | ebe61bd | 2021-02-12 16:57:32 -0500 | [diff] [blame] | 247 | // bytes literals (where they differ from text strings) |
| 248 | {`b"AЀ世😿"`, `b"AЀ世😿`}, // 1-4 byte encodings, literal |
| 249 | {`b"\x41\u0400\u4e16\U0001F63F"`, `b"AЀ世😿"`}, // same, as escapes |
| 250 | {`b"\377\378\x80\xff\xFf"`, `b"\xff\x1f8\x80\xff\xff" EOF`}, // hex/oct escapes allow non-ASCII |
| 251 | {`b"\400"`, `foo.star:1:2: invalid escape sequence \400`}, |
| 252 | {`b"\udc00"`, `foo.star:1:2: invalid Unicode code point U+DC00`}, // (same as string) |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 253 | // floats starting with octal digits |
| 254 | {"012934.", `1.293400e+04 EOF`}, |
| 255 | {"012934.1", `1.293410e+04 EOF`}, |
| 256 | {"012934e1", `1.293400e+05 EOF`}, |
| 257 | {"0123.", `1.230000e+02 EOF`}, |
| 258 | {"0123.1", `1.231000e+02 EOF`}, |
alandonovan | 7a86632 | 2018-11-21 14:57:52 -0500 | [diff] [blame] | 259 | // github.com/google/skylark/issues/16 |
Alan Donovan | e3deafe | 2018-10-23 11:05:09 -0400 | [diff] [blame] | 260 | {"x ! 0", "foo.star:1:3: unexpected input character '!'"}, |
alandonovan | f6c29bf | 2019-01-03 15:19:20 -0500 | [diff] [blame] | 261 | // github.com/google/starlark-go/issues/80 |
| 262 | {"([{<>}])", "( [ { < > } ] ) EOF"}, |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 263 | {"f();", "f ( ) ; EOF"}, |
alandonovan | c1a3d54 | 2019-01-31 13:43:01 -0500 | [diff] [blame] | 264 | // github.com/google/starlark-go/issues/104 |
| 265 | {"def f():\n if x:\n pass\n ", `def f ( ) : newline indent if x : newline indent pass newline outdent outdent EOF`}, |
| 266 | {`while cond: pass`, "while cond : pass EOF"}, |
alandonovan | 22479a3 | 2019-01-09 12:15:31 -0500 | [diff] [blame] | 267 | // github.com/google/starlark-go/issues/107 |
| 268 | {"~= ~= 5", "~ = ~ = 5 EOF"}, |
alandonovan | 988906f | 2019-08-20 13:32:00 -0400 | [diff] [blame] | 269 | {"0in", "0 in EOF"}, |
| 270 | {"0or", "foo.star:1:3: invalid octal literal"}, |
| 271 | {"6in", "6 in EOF"}, |
| 272 | {"6or", "6 or EOF"}, |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 273 | } { |
| 274 | got, err := scan(test.input) |
| 275 | if err != nil { |
Ariel Mashraki | caa37b4 | 2017-10-27 19:27:28 +0300 | [diff] [blame] | 276 | got = err.(Error).Error() |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 277 | } |
alandonovan | ebe61bd | 2021-02-12 16:57:32 -0500 | [diff] [blame] | 278 | // Prefix match allows us to truncate errors in expectations. |
| 279 | // Success cases all end in EOF. |
| 280 | if !strings.HasPrefix(got, test.want) { |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 281 | t.Errorf("scan `%s` = [%s], want [%s]", test.input, got, test.want) |
| 282 | } |
| 283 | } |
| 284 | } |
| 285 | |
// dataFile returns the path of the named file within package
// directory pkgdir, located beneath src/go.starlark.net in the
// default build context's GOPATH.
//
// It duplicates starlarktest.DataFile; the copy lives here so that
// this package does not take part in a dependency cycle.
var dataFile = func(pkgdir, filename string) string {
	elems := []string{
		build.Default.GOPATH,
		"src/go.starlark.net",
		pkgdir,
		filename,
	}
	return filepath.Join(elems...)
}
| 291 | |
| 292 | func BenchmarkScan(b *testing.B) { |
Alan Donovan | 6beab7e | 2018-10-31 17:53:09 -0400 | [diff] [blame] | 293 | filename := dataFile("syntax", "testdata/scan.star") |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 294 | b.StopTimer() |
| 295 | data, err := ioutil.ReadFile(filename) |
| 296 | if err != nil { |
| 297 | b.Fatal(err) |
| 298 | } |
| 299 | b.StartTimer() |
| 300 | |
| 301 | for i := 0; i < b.N; i++ { |
Laurent Le Brun | 689fc22 | 2018-02-22 19:37:18 +0100 | [diff] [blame] | 302 | sc, err := newScanner(filename, data, false) |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 303 | if err != nil { |
| 304 | b.Fatal(err) |
| 305 | } |
| 306 | var val tokenValue |
| 307 | for sc.nextToken(&val) != EOF { |
| 308 | } |
| 309 | } |
| 310 | } |