Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 1 | // Copyright 2017 The Bazel Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package syntax |
| 6 | |
Alan Donovan | e3deafe | 2018-10-23 11:05:09 -0400 | [diff] [blame] | 7 | // A lexical scanner for Starlark. |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 8 | |
| 9 | import ( |
| 10 | "fmt" |
| 11 | "io" |
| 12 | "io/ioutil" |
| 13 | "log" |
Mohamed Elqdusy | 69e9615 | 2018-01-22 20:00:29 +0100 | [diff] [blame] | 14 | "math/big" |
alandonovan | 2f5aafd | 2018-12-07 16:43:08 -0500 | [diff] [blame] | 15 | "os" |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 16 | "strconv" |
| 17 | "strings" |
| 18 | "unicode" |
| 19 | "unicode/utf8" |
| 20 | ) |
| 21 | |
// A Token represents a Starlark lexical token.
//
// The numeric order of the constants is significant: GoString relies on
// all punctuation tokens lying in the contiguous range PLUS..STARSTAR,
// and tokenNames is indexed by Token value.
type Token int8

const (
	ILLEGAL Token = iota
	EOF

	NEWLINE
	INDENT
	OUTDENT

	// Tokens with values
	IDENT  // x
	INT    // 123
	FLOAT  // 1.23e45
	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
	BYTES  // b"foo", etc

	// Punctuation
	PLUS          // +
	MINUS         // -
	STAR          // *
	SLASH         // /
	SLASHSLASH    // //
	PERCENT       // %
	AMP           // &
	PIPE          // |
	CIRCUMFLEX    // ^
	LTLT          // <<
	GTGT          // >>
	TILDE         // ~
	DOT           // .
	COMMA         // ,
	EQ            // =
	SEMI          // ;
	COLON         // :
	LPAREN        // (
	RPAREN        // )
	LBRACK        // [
	RBRACK        // ]
	LBRACE        // {
	RBRACE        // }
	LT            // <
	GT            // >
	GE            // >=
	LE            // <=
	EQL           // ==
	NEQ           // !=
	PLUS_EQ       // += (keep order consistent with PLUS..GTGT)
	MINUS_EQ      // -=
	STAR_EQ       // *=
	SLASH_EQ      // /=
	SLASHSLASH_EQ // //=
	PERCENT_EQ    // %=
	AMP_EQ        // &=
	PIPE_EQ       // |=
	CIRCUMFLEX_EQ // ^=
	LTLT_EQ       // <<=
	GTGT_EQ       // >>=
	STARSTAR      // **

	// Keywords
	AND
	BREAK
	CONTINUE
	DEF
	ELIF
	ELSE
	FOR
	IF
	IN
	LAMBDA
	LOAD
	NOT
	NOT_IN // synthesized by parser from NOT IN
	OR
	PASS
	RETURN
	WHILE

	maxToken
)

// String returns the human-readable name of the token, as recorded in
// tokenNames. tok must be one of the constants declared above;
// any other value indexes outside the table and panics.
func (tok Token) String() string { return tokenNames[tok] }

// GoString is like String but quotes punctuation tokens.
// Use Sprintf("%#v", tok) when constructing error messages.
func (tok Token) GoString() string {
	// All punctuation tokens are declared contiguously from PLUS to STARSTAR.
	if tok >= PLUS && tok <= STARSTAR {
		return "'" + tokenNames[tok] + "'"
	}
	return tokenNames[tok]
}

// tokenNames maps each Token to its display name.
// Every token below maxToken must have an entry here; a missing key
// silently yields the empty string from String and GoString.
var tokenNames = [...]string{
	ILLEGAL:       "illegal token",
	EOF:           "end of file",
	NEWLINE:       "newline",
	INDENT:        "indent",
	OUTDENT:       "outdent",
	IDENT:         "identifier",
	INT:           "int literal",
	FLOAT:         "float literal",
	STRING:        "string literal",
	BYTES:         "bytes literal",
	PLUS:          "+",
	MINUS:         "-",
	STAR:          "*",
	SLASH:         "/",
	SLASHSLASH:    "//",
	PERCENT:       "%",
	AMP:           "&",
	PIPE:          "|",
	CIRCUMFLEX:    "^",
	LTLT:          "<<",
	GTGT:          ">>",
	TILDE:         "~",
	DOT:           ".",
	COMMA:         ",",
	EQ:            "=",
	SEMI:          ";",
	COLON:         ":",
	LPAREN:        "(",
	RPAREN:        ")",
	LBRACK:        "[",
	RBRACK:        "]",
	LBRACE:        "{",
	RBRACE:        "}",
	LT:            "<",
	GT:            ">",
	GE:            ">=",
	LE:            "<=",
	EQL:           "==",
	NEQ:           "!=",
	PLUS_EQ:       "+=",
	MINUS_EQ:      "-=",
	STAR_EQ:       "*=",
	SLASH_EQ:      "/=",
	SLASHSLASH_EQ: "//=",
	PERCENT_EQ:    "%=",
	AMP_EQ:        "&=",
	PIPE_EQ:       "|=",
	CIRCUMFLEX_EQ: "^=",
	LTLT_EQ:       "<<=",
	GTGT_EQ:       ">>=",
	STARSTAR:      "**",
	AND:           "and",
	BREAK:         "break",
	CONTINUE:      "continue",
	DEF:           "def",
	ELIF:          "elif",
	ELSE:          "else",
	FOR:           "for",
	IF:            "if",
	IN:            "in",
	LAMBDA:        "lambda",
	LOAD:          "load",
	NOT:           "not",
	NOT_IN:        "not in",
	OR:            "or",
	PASS:          "pass",
	RETURN:        "return",
	WHILE:         "while",
}
| 185 | |
// A FilePortion describes the content of a portion of a file.
// Callers may provide a FilePortion for the src argument of Parse
// when the desired initial line and column numbers are not (1, 1),
// such as when an expression is parsed from within a larger file.
type FilePortion struct {
	// Content is the text of the portion to scan.
	Content []byte

	// FirstLine is the line number assigned to the start of Content.
	FirstLine int32

	// FirstCol is the column number assigned to the start of Content.
	FirstCol int32
}
| 194 | |
// A Position describes the location of a rune of input.
type Position struct {
	file      *string // filename (indirect for compactness)
	Line, Col int32   // 1-based line and column (rune) numbers; 0 if unknown
}

// IsValid reports whether the position is valid,
// that is, whether it is associated with a file.
func (p Position) IsValid() bool {
	return p.file != nil
}

// Filename returns the name of the file containing this position,
// or "<invalid>" if the position has no file.
func (p Position) Filename() string {
	if p.file == nil {
		return "<invalid>"
	}
	return *p.file
}

// MakePosition returns position with the specified components.
func MakePosition(file *string, line, col int32) Position {
	return Position{file: file, Line: line, Col: col}
}

// add returns the position at the end of s, assuming it starts at p.
func (p Position) add(s string) Position {
	// If s spans lines, only the text after its final newline
	// contributes to the column, which restarts at 1.
	if last := strings.LastIndex(s, "\n"); last >= 0 {
		p.Line += int32(strings.Count(s, "\n"))
		p.Col = 1
		s = s[last+1:]
	}
	p.Col += int32(utf8.RuneCountInString(s))
	return p
}

// String formats the position as "file:line:col", omitting the column
// when it is unknown and both numbers when the line is unknown.
func (p Position) String() string {
	name := p.Filename()
	switch {
	case p.Line > 0 && p.Col > 0:
		return fmt.Sprintf("%s:%d:%d", name, p.Line, p.Col)
	case p.Line > 0:
		return fmt.Sprintf("%s:%d", name, p.Line)
	default:
		return name
	}
}

// isBefore reports whether p precedes q, comparing line numbers first
// and then columns. The file names are not compared.
func (p Position) isBefore(q Position) bool {
	if p.Line == q.Line {
		return p.Col < q.Col
	}
	return p.Line < q.Line
}
| 244 | |
// A scanner represents a single input file being parsed.
// It holds the unread input, the current position, and the
// indentation and bracket-nesting state used to produce tokens.
type scanner struct {
	rest           []byte    // rest of input (in REPL, a line of input)
	token          []byte    // token being scanned
	pos            Position  // current input position
	depth          int       // nesting of [ ] { } ( )
	indentstk      []int     // stack of indentation levels
	dents          int       // number of saved INDENT (>0) or OUTDENT (<0) tokens to return
	lineStart      bool      // after NEWLINE; convert spaces to indentation tokens
	keepComments   bool      // accumulate comments in slice
	lineComments   []Comment // list of full line comments (if keepComments)
	suffixComments []Comment // list of suffix comments (if keepComments)

	// readline is nil unless the source is a line-reading function,
	// in which case rest is refilled from it one line at a time.
	readline func() ([]byte, error) // read next line of input (REPL only)
}
| 260 | |
| 261 | func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) { |
alandonovan | 0a10e4f | 2021-02-08 12:20:22 -0500 | [diff] [blame] | 262 | var firstLine, firstCol int32 = 1, 1 |
| 263 | if portion, ok := src.(FilePortion); ok { |
| 264 | firstLine, firstCol = portion.FirstLine, portion.FirstCol |
| 265 | } |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 266 | sc := &scanner{ |
alandonovan | 0a10e4f | 2021-02-08 12:20:22 -0500 | [diff] [blame] | 267 | pos: MakePosition(&filename, firstLine, firstCol), |
Laurent Le Brun | 689fc22 | 2018-02-22 19:37:18 +0100 | [diff] [blame] | 268 | indentstk: make([]int, 1, 10), // []int{0} + spare capacity |
| 269 | lineStart: true, |
| 270 | keepComments: keepComments, |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 271 | } |
alandonovan | ebe61bd | 2021-02-12 16:57:32 -0500 | [diff] [blame] | 272 | sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 273 | if sc.readline == nil { |
| 274 | data, err := readSource(filename, src) |
| 275 | if err != nil { |
| 276 | return nil, err |
| 277 | } |
| 278 | sc.rest = data |
| 279 | } |
| 280 | return sc, nil |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 281 | } |
| 282 | |
alandonovan | 2f5aafd | 2018-12-07 16:43:08 -0500 | [diff] [blame] | 283 | func readSource(filename string, src interface{}) ([]byte, error) { |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 284 | switch src := src.(type) { |
| 285 | case string: |
alandonovan | 2f5aafd | 2018-12-07 16:43:08 -0500 | [diff] [blame] | 286 | return []byte(src), nil |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 287 | case []byte: |
alandonovan | 2f5aafd | 2018-12-07 16:43:08 -0500 | [diff] [blame] | 288 | return src, nil |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 289 | case io.Reader: |
alandonovan | 2f5aafd | 2018-12-07 16:43:08 -0500 | [diff] [blame] | 290 | data, err := ioutil.ReadAll(src) |
| 291 | if err != nil { |
| 292 | err = &os.PathError{Op: "read", Path: filename, Err: err} |
Misha Brukman | 40bc3a5 | 2020-01-25 19:52:27 -0500 | [diff] [blame] | 293 | return nil, err |
alandonovan | 2f5aafd | 2018-12-07 16:43:08 -0500 | [diff] [blame] | 294 | } |
| 295 | return data, nil |
alandonovan | 0a10e4f | 2021-02-08 12:20:22 -0500 | [diff] [blame] | 296 | case FilePortion: |
| 297 | return src.Content, nil |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 298 | case nil: |
alandonovan | 2f5aafd | 2018-12-07 16:43:08 -0500 | [diff] [blame] | 299 | return ioutil.ReadFile(filename) |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 300 | default: |
| 301 | return nil, fmt.Errorf("invalid source: %T", src) |
| 302 | } |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 303 | } |
| 304 | |
| 305 | // An Error describes the nature and position of a scanner or parser error. |
| 306 | type Error struct { |
| 307 | Pos Position |
| 308 | Msg string |
| 309 | } |
| 310 | |
| 311 | func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg } |
| 312 | |
| 313 | // errorf is called to report an error. |
| 314 | // errorf does not return: it panics. |
| 315 | func (sc *scanner) error(pos Position, s string) { |
| 316 | panic(Error{pos, s}) |
| 317 | } |
| 318 | |
| 319 | func (sc *scanner) errorf(pos Position, format string, args ...interface{}) { |
| 320 | sc.error(pos, fmt.Sprintf(format, args...)) |
| 321 | } |
| 322 | |
| 323 | func (sc *scanner) recover(err *error) { |
| 324 | // The scanner and parser panic both for routine errors like |
| 325 | // syntax errors and for programmer bugs like array index |
| 326 | // errors. Turn both into error returns. Catching bug panics |
| 327 | // is especially important when processing many files. |
| 328 | switch e := recover().(type) { |
| 329 | case nil: |
| 330 | // no panic |
| 331 | case Error: |
| 332 | *err = e |
| 333 | default: |
| 334 | *err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)} |
| 335 | if debug { |
| 336 | log.Fatal(*err) |
| 337 | } |
| 338 | } |
| 339 | } |
| 340 | |
| 341 | // eof reports whether the input has reached end of file. |
| 342 | func (sc *scanner) eof() bool { |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 343 | return len(sc.rest) == 0 && !sc.readLine() |
| 344 | } |
| 345 | |
| 346 | // readLine attempts to read another line of input. |
| 347 | // Precondition: len(sc.rest)==0. |
| 348 | func (sc *scanner) readLine() bool { |
| 349 | if sc.readline != nil { |
| 350 | var err error |
| 351 | sc.rest, err = sc.readline() |
| 352 | if err != nil { |
| 353 | sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt |
| 354 | } |
| 355 | return len(sc.rest) > 0 |
| 356 | } |
| 357 | return false |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 358 | } |
| 359 | |
| 360 | // peekRune returns the next rune in the input without consuming it. |
| 361 | // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'. |
| 362 | func (sc *scanner) peekRune() rune { |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 363 | // TODO(adonovan): opt: measure and perhaps inline eof. |
| 364 | if sc.eof() { |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 365 | return 0 |
| 366 | } |
| 367 | |
| 368 | // fast path: ASCII |
| 369 | if b := sc.rest[0]; b < utf8.RuneSelf { |
| 370 | if b == '\r' { |
| 371 | return '\n' |
| 372 | } |
| 373 | return rune(b) |
| 374 | } |
| 375 | |
| 376 | r, _ := utf8.DecodeRune(sc.rest) |
| 377 | return r |
| 378 | } |
| 379 | |
| 380 | // readRune consumes and returns the next rune in the input. |
| 381 | // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'. |
| 382 | func (sc *scanner) readRune() rune { |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 383 | // eof() has been inlined here, both to avoid a call |
| 384 | // and to establish len(rest)>0 to avoid a bounds check. |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 385 | if len(sc.rest) == 0 { |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 386 | if !sc.readLine() { |
| 387 | sc.error(sc.pos, "internal scanner error: readRune at EOF") |
| 388 | } |
| 389 | // Redundant, but eliminates the bounds-check below. |
| 390 | if len(sc.rest) == 0 { |
| 391 | return 0 |
| 392 | } |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 393 | } |
| 394 | |
| 395 | // fast path: ASCII |
| 396 | if b := sc.rest[0]; b < utf8.RuneSelf { |
| 397 | r := rune(b) |
| 398 | sc.rest = sc.rest[1:] |
| 399 | if r == '\r' { |
| 400 | if len(sc.rest) > 0 && sc.rest[0] == '\n' { |
| 401 | sc.rest = sc.rest[1:] |
| 402 | } |
| 403 | r = '\n' |
| 404 | } |
| 405 | if r == '\n' { |
| 406 | sc.pos.Line++ |
| 407 | sc.pos.Col = 1 |
| 408 | } else { |
| 409 | sc.pos.Col++ |
| 410 | } |
| 411 | return r |
| 412 | } |
| 413 | |
| 414 | r, size := utf8.DecodeRune(sc.rest) |
| 415 | sc.rest = sc.rest[size:] |
| 416 | sc.pos.Col++ |
| 417 | return r |
| 418 | } |
| 419 | |
// tokenValue records the position and value associated with each token.
// startToken sets pos and clears raw; endToken fills in raw if the
// caller has not already done so.
type tokenValue struct {
	raw    string   // raw text of token
	int    int64    // decoded int
	bigInt *big.Int // decoded integers > int64
	float  float64  // decoded float
	string string   // decoded string or bytes
	pos    Position // start position of token
}
| 429 | |
| 430 | // startToken marks the beginning of the next input token. |
| 431 | // It must be followed by a call to endToken once the token has |
| 432 | // been consumed using readRune. |
| 433 | func (sc *scanner) startToken(val *tokenValue) { |
| 434 | sc.token = sc.rest |
| 435 | val.raw = "" |
| 436 | val.pos = sc.pos |
| 437 | } |
| 438 | |
| 439 | // endToken marks the end of an input token. |
| 440 | // It records the actual token string in val.raw if the caller |
| 441 | // has not done that already. |
| 442 | func (sc *scanner) endToken(val *tokenValue) { |
| 443 | if val.raw == "" { |
| 444 | val.raw = string(sc.token[:len(sc.token)-len(sc.rest)]) |
| 445 | } |
| 446 | } |
| 447 | |
// nextToken is called by the parser to obtain the next input token.
// It returns the token value and sets val to the data associated with
// the token.
//
// For all our input tokens, the associated data is val.pos (the
// position where the token begins), val.raw (the input string
// corresponding to the token). For string and int tokens, the string
// and int fields additionally contain the token's interpreted value.
//
// Lexical errors are reported through sc.error and sc.errorf,
// which panic rather than return.
func (sc *scanner) nextToken(val *tokenValue) Token {

	// The following distribution of tokens guides case ordering:
	//
	//      COMMA          27   %
	//      STRING         23   %
	//      IDENT          15   %
	//      EQL            11   %
	//      LBRACK          5.5 %
	//      RBRACK          5.5 %
	//      NEWLINE         3   %
	//      LPAREN          2.9 %
	//      RPAREN          2.9 %
	//      INT             2   %
	//      others        < 1   %
	//
	// Although NEWLINE tokens are infrequent, and lineStart is
	// usually (~97%) false on entry, skipped newlines account for
	// about 50% of all iterations of the 'start' loop.

start:
	var c rune

	// Deal with leading spaces and indentation.
	blank := false
	// Remember whether we entered at the start of a line;
	// the EOF case below uses this to decide whether a NEWLINE is still owed.
	savedLineStart := sc.lineStart
	if sc.lineStart {
		sc.lineStart = false
		col := 0
		for {
			c = sc.peekRune()
			if c == ' ' {
				col++
				sc.readRune()
			} else if c == '\t' {
				// A tab advances the indentation to the next multiple of 8.
				const tab = 8
				col += int(tab - (sc.pos.Col-1)%tab)
				sc.readRune()
			} else {
				break
			}
		}

		// The third clause matches EOF.
		if c == '#' || c == '\n' || c == 0 {
			blank = true
		}

		// Compute indentation level for non-blank lines not
		// inside an expression.  This is not the common case.
		if !blank && sc.depth == 0 {
			cur := sc.indentstk[len(sc.indentstk)-1]
			if col > cur {
				// indent
				sc.dents++
				sc.indentstk = append(sc.indentstk, col)
			} else if col < cur {
				// outdent(s)
				for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] {
					sc.dents--
					sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop
				}
				// After popping, the new top must equal col exactly.
				if col != sc.indentstk[len(sc.indentstk)-1] {
					sc.error(sc.pos, "unindent does not match any outer indentation level")
				}
			}
		}
	}

	// Return saved indentation tokens.
	// These are zero-width: nothing is consumed between startToken and endToken.
	if sc.dents != 0 {
		sc.startToken(val)
		sc.endToken(val)
		if sc.dents < 0 {
			sc.dents++
			return OUTDENT
		} else {
			sc.dents--
			return INDENT
		}
	}

	// start of line proper
	c = sc.peekRune()

	// Skip spaces.
	for c == ' ' || c == '\t' {
		sc.readRune()
		c = sc.peekRune()
	}

	// comment
	if c == '#' {
		if sc.keepComments {
			sc.startToken(val)
		}
		// Consume up to, but not including, the newline (or EOF).
		for c != 0 && c != '\n' {
			sc.readRune()
			c = sc.peekRune()
		}
		if sc.keepComments {
			sc.endToken(val)
			// A comment on an otherwise blank line is a line comment;
			// one that follows other tokens is a suffix comment.
			if blank {
				sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw})
			} else {
				sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw})
			}
		}
	}

	// newline
	if c == '\n' {
		sc.lineStart = true

		// Ignore newlines within expressions (common case).
		if sc.depth > 0 {
			sc.readRune()
			goto start
		}

		// Ignore blank lines, except in the REPL,
		// where they emit OUTDENTs and NEWLINE.
		if blank {
			if sc.readline == nil {
				sc.readRune()
				goto start
			} else if len(sc.indentstk) > 1 {
				// Close every open block: one OUTDENT per stacked level.
				// The newline is not consumed, so it is seen again after
				// the OUTDENTs have been returned.
				sc.dents = 1 - len(sc.indentstk)
				sc.indentstk = sc.indentstk[:1]
				goto start
			}
		}

		// At top-level (not in an expression).
		sc.startToken(val)
		sc.readRune()
		val.raw = "\n"
		return NEWLINE
	}

	// end of file
	if c == 0 {
		// Emit OUTDENTs for unfinished indentation,
		// preceded by a NEWLINE if we haven't just emitted one.
		if len(sc.indentstk) > 1 {
			if savedLineStart {
				sc.dents = 1 - len(sc.indentstk)
				sc.indentstk = sc.indentstk[:1]
				goto start
			} else {
				// Synthesize the missing NEWLINE; no input is consumed.
				sc.lineStart = true
				sc.startToken(val)
				val.raw = "\n"
				return NEWLINE
			}
		}

		sc.startToken(val)
		sc.endToken(val)
		return EOF
	}

	// line continuation
	if c == '\\' {
		sc.readRune()
		if sc.peekRune() != '\n' {
			sc.errorf(sc.pos, "stray backslash in program")
		}
		sc.readRune()
		goto start
	}

	// start of the next token
	sc.startToken(val)

	// comma (common case)
	if c == ',' {
		sc.readRune()
		sc.endToken(val)
		return COMMA
	}

	// string literal
	if c == '"' || c == '\'' {
		return sc.scanString(val, c)
	}

	// identifier or keyword
	if isIdentStart(c) {
		// A quote directly after an r, b, or rb prefix makes this a
		// string literal: consume the prefix and let scanString
		// handle the rest, starting at the quote.
		if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
			//  r"..."
			//  b"..."
			sc.readRune()
			c = sc.peekRune()
			return sc.scanString(val, c)
		} else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') {
			// rb"..."
			sc.readRune()
			sc.readRune()
			c = sc.peekRune()
			return sc.scanString(val, c)
		}

		for isIdent(c) {
			sc.readRune()
			c = sc.peekRune()
		}
		sc.endToken(val)
		// Keywords are identifiers that appear in the keywordToken table.
		if k, ok := keywordToken[val.raw]; ok {
			return k
		}

		return IDENT
	}

	// brackets
	switch c {
	case '[', '(', '{':
		// Track nesting: newlines are ignored while depth > 0.
		sc.depth++
		sc.readRune()
		sc.endToken(val)
		switch c {
		case '[':
			return LBRACK
		case '(':
			return LPAREN
		case '{':
			return LBRACE
		}
		panic("unreachable")

	case ']', ')', '}':
		if sc.depth == 0 {
			sc.errorf(sc.pos, "unexpected %q", c)
		} else {
			sc.depth--
		}
		sc.readRune()
		sc.endToken(val)
		switch c {
		case ']':
			return RBRACK
		case ')':
			return RPAREN
		case '}':
			return RBRACE
		}
		panic("unreachable")
	}

	// int or float literal, or period
	if isdigit(c) || c == '.' {
		return sc.scanNumber(val, c)
	}

	// other punctuation
	// The deferred endToken records val.raw on every return below.
	defer sc.endToken(val)
	switch c {
	case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '='
		start := sc.pos
		sc.readRune()
		if sc.peekRune() == '=' {
			sc.readRune()
			switch c {
			case '<':
				return LE
			case '>':
				return GE
			case '=':
				return EQL
			case '!':
				return NEQ
			case '+':
				return PLUS_EQ
			case '-':
				return MINUS_EQ
			case '/':
				return SLASH_EQ
			case '%':
				return PERCENT_EQ
			case '&':
				return AMP_EQ
			case '|':
				return PIPE_EQ
			case '^':
				return CIRCUMFLEX_EQ
			}
		}
		// Not followed by '=': a single-character operator, or the
		// first character of <<, >>, // and their assignment forms.
		switch c {
		case '=':
			return EQ
		case '<':
			if sc.peekRune() == '<' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return LTLT_EQ
				} else {
					return LTLT
				}
			}
			return LT
		case '>':
			if sc.peekRune() == '>' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return GTGT_EQ
				} else {
					return GTGT
				}
			}
			return GT
		case '!':
			// '!' is valid only as part of "!="; sc.error panics.
			sc.error(start, "unexpected input character '!'")
		case '+':
			return PLUS
		case '-':
			return MINUS
		case '/':
			if sc.peekRune() == '/' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return SLASHSLASH_EQ
				} else {
					return SLASHSLASH
				}
			}
			return SLASH
		case '%':
			return PERCENT
		case '&':
			return AMP
		case '|':
			return PIPE
		case '^':
			return CIRCUMFLEX
		}
		panic("unreachable")

	case ':', ';', '~': // single-char tokens (except comma)
		sc.readRune()
		switch c {
		case ':':
			return COLON
		case ';':
			return SEMI
		case '~':
			return TILDE
		}
		panic("unreachable")

	case '*': // possibly followed by '*' or '='
		sc.readRune()
		switch sc.peekRune() {
		case '*':
			sc.readRune()
			return STARSTAR
		case '=':
			sc.readRune()
			return STAR_EQ
		}
		return STAR
	}

	// No case matched: c cannot start any token. errorf panics.
	sc.errorf(sc.pos, "unexpected input character %#q", c)
	panic("unreachable")
}
| 826 | |
| 827 | func (sc *scanner) scanString(val *tokenValue, quote rune) Token { |
Ariel Mashraki | caa37b4 | 2017-10-27 19:27:28 +0300 | [diff] [blame] | 828 | start := sc.pos |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 829 | triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote) |
| 830 | sc.readRune() |
alandonovan | e8819e8 | 2020-03-26 17:56:36 -0400 | [diff] [blame] | 831 | |
| 832 | // String literals may contain escaped or unescaped newlines, |
| 833 | // causing them to span multiple lines (gulps) of REPL input; |
| 834 | // they are the only such token. Thus we cannot call endToken, |
| 835 | // as it assumes sc.rest is unchanged since startToken. |
| 836 | // Instead, buffer the token here. |
| 837 | // TODO(adonovan): opt: buffer only if we encounter a newline. |
| 838 | raw := new(strings.Builder) |
| 839 | |
| 840 | // Copy the prefix, e.g. r' or " (see startToken). |
| 841 | raw.Write(sc.token[:len(sc.token)-len(sc.rest)]) |
| 842 | |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 843 | if !triple { |
alandonovan | e8819e8 | 2020-03-26 17:56:36 -0400 | [diff] [blame] | 844 | // single-quoted string literal |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 845 | for { |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 846 | if sc.eof() { |
| 847 | sc.error(val.pos, "unexpected EOF in string") |
| 848 | } |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 849 | c := sc.readRune() |
alandonovan | e8819e8 | 2020-03-26 17:56:36 -0400 | [diff] [blame] | 850 | raw.WriteRune(c) |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 851 | if c == quote { |
| 852 | break |
| 853 | } |
| 854 | if c == '\n' { |
| 855 | sc.error(val.pos, "unexpected newline in string") |
| 856 | } |
| 857 | if c == '\\' { |
| 858 | if sc.eof() { |
| 859 | sc.error(val.pos, "unexpected EOF in string") |
| 860 | } |
alandonovan | e8819e8 | 2020-03-26 17:56:36 -0400 | [diff] [blame] | 861 | c = sc.readRune() |
| 862 | raw.WriteRune(c) |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 863 | } |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 864 | } |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 865 | } else { |
| 866 | // triple-quoted string literal |
| 867 | sc.readRune() |
alandonovan | e8819e8 | 2020-03-26 17:56:36 -0400 | [diff] [blame] | 868 | raw.WriteRune(quote) |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 869 | sc.readRune() |
alandonovan | e8819e8 | 2020-03-26 17:56:36 -0400 | [diff] [blame] | 870 | raw.WriteRune(quote) |
alandonovan | 30e71c6 | 2019-01-04 13:48:12 -0500 | [diff] [blame] | 871 | |
| 872 | quoteCount := 0 |
| 873 | for { |
| 874 | if sc.eof() { |
| 875 | sc.error(val.pos, "unexpected EOF in string") |
| 876 | } |
| 877 | c := sc.readRune() |
| 878 | raw.WriteRune(c) |
| 879 | if c == quote { |
| 880 | quoteCount++ |
| 881 | if quoteCount == 3 { |
| 882 | break |
| 883 | } |
| 884 | } else { |
| 885 | quoteCount = 0 |
| 886 | } |
| 887 | if c == '\\' { |
| 888 | if sc.eof() { |
| 889 | sc.error(val.pos, "unexpected EOF in string") |
| 890 | } |
| 891 | c = sc.readRune() |
| 892 | raw.WriteRune(c) |
| 893 | } |
| 894 | } |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 895 | } |
alandonovan | e8819e8 | 2020-03-26 17:56:36 -0400 | [diff] [blame] | 896 | val.raw = raw.String() |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 897 | |
alandonovan | ebe61bd | 2021-02-12 16:57:32 -0500 | [diff] [blame] | 898 | s, _, isByte, err := unquote(val.raw) |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 899 | if err != nil { |
Ariel Mashraki | caa37b4 | 2017-10-27 19:27:28 +0300 | [diff] [blame] | 900 | sc.error(start, err.Error()) |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 901 | } |
| 902 | val.string = s |
alandonovan | ebe61bd | 2021-02-12 16:57:32 -0500 | [diff] [blame] | 903 | if isByte { |
| 904 | return BYTES |
| 905 | } else { |
| 906 | return STRING |
| 907 | } |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 908 | } |
| 909 | |
| 910 | func (sc *scanner) scanNumber(val *tokenValue, c rune) Token { |
alandonovan | 04aba6e | 2018-11-05 17:45:33 -0500 | [diff] [blame] | 911 | // https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements |
alandonovan | 5ce1e42 | 2017-10-17 15:20:32 -0400 | [diff] [blame] | 912 | // |
| 913 | // Python features not supported: |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 914 | // - integer literals of >64 bits of precision |
| 915 | // - 123L or 123l long suffix |
| 916 | // - traditional octal: 0755 |
alandonovan | 5ce1e42 | 2017-10-17 15:20:32 -0400 | [diff] [blame] | 917 | // https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 918 | |
Ariel Mashraki | caa37b4 | 2017-10-27 19:27:28 +0300 | [diff] [blame] | 919 | start := sc.pos |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 920 | fraction, exponent := false, false |
| 921 | |
| 922 | if c == '.' { |
| 923 | // dot or start of fraction |
| 924 | sc.readRune() |
| 925 | c = sc.peekRune() |
| 926 | if !isdigit(c) { |
| 927 | sc.endToken(val) |
| 928 | return DOT |
| 929 | } |
| 930 | fraction = true |
| 931 | } else if c == '0' { |
Mohamed Elqdusy | 3b32df9 | 2018-01-08 17:20:46 +0100 | [diff] [blame] | 932 | // hex, octal, binary or float |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 933 | sc.readRune() |
| 934 | c = sc.peekRune() |
| 935 | |
| 936 | if c == '.' { |
| 937 | fraction = true |
| 938 | } else if c == 'x' || c == 'X' { |
| 939 | // hex |
| 940 | sc.readRune() |
| 941 | c = sc.peekRune() |
| 942 | if !isxdigit(c) { |
Ariel Mashraki | caa37b4 | 2017-10-27 19:27:28 +0300 | [diff] [blame] | 943 | sc.error(start, "invalid hex literal") |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 944 | } |
| 945 | for isxdigit(c) { |
| 946 | sc.readRune() |
| 947 | c = sc.peekRune() |
| 948 | } |
| 949 | } else if c == 'o' || c == 'O' { |
| 950 | // octal |
| 951 | sc.readRune() |
| 952 | c = sc.peekRune() |
| 953 | if !isodigit(c) { |
| 954 | sc.error(sc.pos, "invalid octal literal") |
| 955 | } |
| 956 | for isodigit(c) { |
| 957 | sc.readRune() |
| 958 | c = sc.peekRune() |
| 959 | } |
Mohamed Elqdusy | 3b32df9 | 2018-01-08 17:20:46 +0100 | [diff] [blame] | 960 | } else if c == 'b' || c == 'B' { |
| 961 | // binary |
| 962 | sc.readRune() |
| 963 | c = sc.peekRune() |
| 964 | if !isbdigit(c) { |
| 965 | sc.error(sc.pos, "invalid binary literal") |
| 966 | } |
| 967 | for isbdigit(c) { |
| 968 | sc.readRune() |
| 969 | c = sc.peekRune() |
| 970 | } |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 971 | } else { |
| 972 | // float (or obsolete octal "0755") |
| 973 | allzeros, octal := true, true |
| 974 | for isdigit(c) { |
| 975 | if c != '0' { |
| 976 | allzeros = false |
| 977 | } |
| 978 | if c > '7' { |
| 979 | octal = false |
| 980 | } |
| 981 | sc.readRune() |
| 982 | c = sc.peekRune() |
| 983 | } |
| 984 | if c == '.' { |
| 985 | fraction = true |
| 986 | } else if c == 'e' || c == 'E' { |
| 987 | exponent = true |
| 988 | } else if octal && !allzeros { |
alandonovan | a475931 | 2019-05-28 16:17:46 -0400 | [diff] [blame] | 989 | sc.endToken(val) |
| 990 | sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:]) |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 991 | } |
| 992 | } |
| 993 | } else { |
| 994 | // decimal |
| 995 | for isdigit(c) { |
| 996 | sc.readRune() |
| 997 | c = sc.peekRune() |
| 998 | } |
| 999 | |
| 1000 | if c == '.' { |
| 1001 | fraction = true |
| 1002 | } else if c == 'e' || c == 'E' { |
| 1003 | exponent = true |
| 1004 | } |
| 1005 | } |
| 1006 | |
| 1007 | if fraction { |
| 1008 | sc.readRune() // consume '.' |
| 1009 | c = sc.peekRune() |
| 1010 | for isdigit(c) { |
| 1011 | sc.readRune() |
| 1012 | c = sc.peekRune() |
| 1013 | } |
| 1014 | |
| 1015 | if c == 'e' || c == 'E' { |
| 1016 | exponent = true |
| 1017 | } |
| 1018 | } |
| 1019 | |
| 1020 | if exponent { |
| 1021 | sc.readRune() // consume [eE] |
| 1022 | c = sc.peekRune() |
| 1023 | if c == '+' || c == '-' { |
| 1024 | sc.readRune() |
| 1025 | c = sc.peekRune() |
| 1026 | if !isdigit(c) { |
| 1027 | sc.error(sc.pos, "invalid float literal") |
| 1028 | } |
| 1029 | } |
| 1030 | for isdigit(c) { |
| 1031 | sc.readRune() |
| 1032 | c = sc.peekRune() |
| 1033 | } |
| 1034 | } |
| 1035 | |
| 1036 | sc.endToken(val) |
| 1037 | if fraction || exponent { |
| 1038 | var err error |
| 1039 | val.float, err = strconv.ParseFloat(val.raw, 64) |
| 1040 | if err != nil { |
| 1041 | sc.error(sc.pos, "invalid float literal") |
| 1042 | } |
| 1043 | return FLOAT |
| 1044 | } else { |
| 1045 | var err error |
| 1046 | s := val.raw |
Mohamed Elqdusy | 69e9615 | 2018-01-22 20:00:29 +0100 | [diff] [blame] | 1047 | val.bigInt = nil |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 1048 | if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') { |
| 1049 | val.int, err = strconv.ParseInt(s[2:], 8, 64) |
Mohamed Elqdusy | 3b32df9 | 2018-01-08 17:20:46 +0100 | [diff] [blame] | 1050 | } else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') { |
| 1051 | val.int, err = strconv.ParseInt(s[2:], 2, 64) |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 1052 | } else { |
| 1053 | val.int, err = strconv.ParseInt(s, 0, 64) |
Mohamed Elqdusy | 69e9615 | 2018-01-22 20:00:29 +0100 | [diff] [blame] | 1054 | if err != nil { |
| 1055 | num := new(big.Int) |
Misha Brukman | 40bc3a5 | 2020-01-25 19:52:27 -0500 | [diff] [blame] | 1056 | var ok bool |
Mohamed Elqdusy | 69e9615 | 2018-01-22 20:00:29 +0100 | [diff] [blame] | 1057 | val.bigInt, ok = num.SetString(s, 0) |
| 1058 | if ok { |
| 1059 | err = nil |
| 1060 | } |
| 1061 | } |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 1062 | } |
| 1063 | if err != nil { |
Ariel Mashraki | caa37b4 | 2017-10-27 19:27:28 +0300 | [diff] [blame] | 1064 | sc.error(start, "invalid int literal") |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 1065 | } |
| 1066 | return INT |
| 1067 | } |
| 1068 | } |
| 1069 | |
| 1070 | // isIdent reports whether c is an identifier rune. |
| 1071 | func isIdent(c rune) bool { |
| 1072 | return isdigit(c) || isIdentStart(c) |
| 1073 | } |
| 1074 | |
// isIdentStart reports whether c is an ASCII letter, an underscore,
// or any other rune that unicode.IsLetter classifies as a letter.
func isIdentStart(c rune) bool {
	switch {
	case 'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z',
		c == '_':
		return true
	}
	return unicode.IsLetter(c)
}
| 1081 | |
// isdigit reports whether c is an ASCII decimal digit.
func isdigit(c rune) bool {
	return c >= '0' && c <= '9'
}

// isodigit reports whether c is an ASCII octal digit.
func isodigit(c rune) bool {
	return c >= '0' && c <= '7'
}

// isxdigit reports whether c is an ASCII hexadecimal digit of either case.
func isxdigit(c rune) bool {
	switch {
	case isdigit(c),
		'A' <= c && c <= 'F',
		'a' <= c && c <= 'f':
		return true
	}
	return false
}

// isbdigit reports whether c is a binary digit.
func isbdigit(c rune) bool {
	return c == '0' || c == '1'
}
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 1086 | |
| 1087 | // keywordToken records the special tokens for |
| 1088 | // strings that should not be treated as ordinary identifiers. |
| 1089 | var keywordToken = map[string]Token{ |
| 1090 | "and": AND, |
| 1091 | "break": BREAK, |
| 1092 | "continue": CONTINUE, |
| 1093 | "def": DEF, |
| 1094 | "elif": ELIF, |
| 1095 | "else": ELSE, |
| 1096 | "for": FOR, |
| 1097 | "if": IF, |
| 1098 | "in": IN, |
| 1099 | "lambda": LAMBDA, |
alandonovan | 6696fc3 | 2017-10-20 10:55:17 -0400 | [diff] [blame] | 1100 | "load": LOAD, |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 1101 | "not": NOT, |
| 1102 | "or": OR, |
| 1103 | "pass": PASS, |
| 1104 | "return": RETURN, |
Alessandro Arzilli | 678bafe | 2018-12-07 17:28:35 +0100 | [diff] [blame] | 1105 | "while": WHILE, |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 1106 | |
| 1107 | // reserved words: |
| 1108 | "as": ILLEGAL, |
| 1109 | // "assert": ILLEGAL, // heavily used by our tests |
| 1110 | "class": ILLEGAL, |
| 1111 | "del": ILLEGAL, |
| 1112 | "except": ILLEGAL, |
| 1113 | "finally": ILLEGAL, |
| 1114 | "from": ILLEGAL, |
| 1115 | "global": ILLEGAL, |
| 1116 | "import": ILLEGAL, |
| 1117 | "is": ILLEGAL, |
| 1118 | "nonlocal": ILLEGAL, |
| 1119 | "raise": ILLEGAL, |
| 1120 | "try": ILLEGAL, |
Alan Donovan | 312d1a5 | 2017-10-02 10:10:28 -0400 | [diff] [blame] | 1121 | "with": ILLEGAL, |
| 1122 | "yield": ILLEGAL, |
| 1123 | } |