blob: bb4165e9d2637b4f990f3be196636750b6b16c75 [file] [log] [blame]
// Copyright 2017 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5package syntax
6
// A lexical scanner for Starlark.
Alan Donovan312d1a52017-10-02 10:10:28 -04008
9import (
10 "fmt"
11 "io"
12 "io/ioutil"
13 "log"
Mohamed Elqdusy69e96152018-01-22 20:00:29 +010014 "math/big"
alandonovan2f5aafd2018-12-07 16:43:08 -050015 "os"
Alan Donovan312d1a52017-10-02 10:10:28 -040016 "strconv"
17 "strings"
18 "unicode"
19 "unicode/utf8"
20)
21
// A Token represents a Starlark lexical token.
type Token int8

const (
	ILLEGAL Token = iota
	EOF

	NEWLINE
	INDENT
	OUTDENT

	// Tokens with values
	IDENT  // x
	INT    // 123
	FLOAT  // 1.23e45
	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
	BYTES  // b"foo", etc

	// Punctuation
	PLUS          // +
	MINUS         // -
	STAR          // *
	SLASH         // /
	SLASHSLASH    // //
	PERCENT       // %
	AMP           // &
	PIPE          // |
	CIRCUMFLEX    // ^
	LTLT          // <<
	GTGT          // >>
	TILDE         // ~
	DOT           // .
	COMMA         // ,
	EQ            // =
	SEMI          // ;
	COLON         // :
	LPAREN        // (
	RPAREN        // )
	LBRACK        // [
	RBRACK        // ]
	LBRACE        // {
	RBRACE        // }
	LT            // <
	GT            // >
	GE            // >=
	LE            // <=
	EQL           // ==
	NEQ           // !=
	PLUS_EQ       // +=    (keep order consistent with PLUS..GTGT)
	MINUS_EQ      // -=
	STAR_EQ       // *=
	SLASH_EQ      // /=
	SLASHSLASH_EQ // //=
	PERCENT_EQ    // %=
	AMP_EQ        // &=
	PIPE_EQ       // |=
	CIRCUMFLEX_EQ // ^=
	LTLT_EQ       // <<=
	GTGT_EQ       // >>=
	STARSTAR      // **

	// Keywords
	AND
	BREAK
	CONTINUE
	DEF
	ELIF
	ELSE
	FOR
	IF
	IN
	LAMBDA
	LOAD
	NOT
	NOT_IN // synthesized by parser from NOT IN
	OR
	PASS
	RETURN
	WHILE

	maxToken
)

// String returns the human-readable name of the token,
// as recorded in tokenNames.
func (tok Token) String() string { return tokenNames[tok] }

// GoString is like String but quotes punctuation tokens.
// Use Sprintf("%#v", tok) when constructing error messages.
func (tok Token) GoString() string {
	// The punctuation tokens occupy the contiguous range PLUS..STARSTAR.
	if tok >= PLUS && tok <= STARSTAR {
		return "'" + tokenNames[tok] + "'"
	}
	return tokenNames[tok]
}

// tokenNames maps each Token to its display name.
// Every token below maxToken must have an entry; a missing entry
// would make String return the empty string.
var tokenNames = [...]string{
	ILLEGAL:       "illegal token",
	EOF:           "end of file",
	NEWLINE:       "newline",
	INDENT:        "indent",
	OUTDENT:       "outdent",
	IDENT:         "identifier",
	INT:           "int literal",
	FLOAT:         "float literal",
	STRING:        "string literal",
	BYTES:         "bytes literal",
	PLUS:          "+",
	MINUS:         "-",
	STAR:          "*",
	SLASH:         "/",
	SLASHSLASH:    "//",
	PERCENT:       "%",
	AMP:           "&",
	PIPE:          "|",
	CIRCUMFLEX:    "^",
	LTLT:          "<<",
	GTGT:          ">>",
	TILDE:         "~",
	DOT:           ".",
	COMMA:         ",",
	EQ:            "=",
	SEMI:          ";",
	COLON:         ":",
	LPAREN:        "(",
	RPAREN:        ")",
	LBRACK:        "[",
	RBRACK:        "]",
	LBRACE:        "{",
	RBRACE:        "}",
	LT:            "<",
	GT:            ">",
	GE:            ">=",
	LE:            "<=",
	EQL:           "==",
	NEQ:           "!=",
	PLUS_EQ:       "+=",
	MINUS_EQ:      "-=",
	STAR_EQ:       "*=",
	SLASH_EQ:      "/=",
	SLASHSLASH_EQ: "//=",
	PERCENT_EQ:    "%=",
	AMP_EQ:        "&=",
	PIPE_EQ:       "|=",
	CIRCUMFLEX_EQ: "^=",
	LTLT_EQ:       "<<=",
	GTGT_EQ:       ">>=",
	STARSTAR:      "**",
	AND:           "and",
	BREAK:         "break",
	CONTINUE:      "continue",
	DEF:           "def",
	ELIF:          "elif",
	ELSE:          "else",
	FOR:           "for",
	IF:            "if",
	IN:            "in",
	LAMBDA:        "lambda",
	LOAD:          "load",
	NOT:           "not",
	NOT_IN:        "not in",
	OR:            "or",
	PASS:          "pass",
	RETURN:        "return",
	WHILE:         "while",
}
185
// A FilePortion describes the content of a portion of a file.
// Callers may provide a FilePortion for the src argument of Parse
// when the desired initial line and column numbers are not (1, 1),
// such as when an expression is parsed from within a larger file.
type FilePortion struct {
	Content   []byte // the text of the portion
	FirstLine int32  // line number assigned to the start of Content
	FirstCol  int32  // column number assigned to the start of Content
}
194
// A Position describes the location of a rune of input.
type Position struct {
	file *string // filename (indirect for compactness)
	Line int32   // 1-based line number; 0 if line unknown
	Col  int32   // 1-based column (rune) number; 0 if column unknown
}

// IsValid reports whether the position is valid,
// that is, whether it is associated with a file.
func (p Position) IsValid() bool { return p.file != nil }

// Filename returns the name of the file containing this position,
// or "<invalid>" if the position has no file.
func (p Position) Filename() string {
	if p.file == nil {
		return "<invalid>"
	}
	return *p.file
}

// MakePosition returns a position with the specified components.
func MakePosition(file *string, line, col int32) Position {
	return Position{file: file, Line: line, Col: col}
}

// add returns the position at the end of s, assuming it starts at p.
func (p Position) add(s string) Position {
	// If s spans several lines, only the text after its final
	// newline contributes to the column.
	if nl := strings.LastIndexByte(s, '\n'); nl >= 0 {
		p.Line += int32(strings.Count(s, "\n"))
		p.Col = 1
		s = s[nl+1:]
	}
	p.Col += int32(utf8.RuneCountInString(s))
	return p
}

// String formats the position as "file:line:col", omitting the
// column, or both line and column, when they are unknown.
func (p Position) String() string {
	name := p.Filename()
	switch {
	case p.Line <= 0:
		return name
	case p.Col <= 0:
		return fmt.Sprintf("%s:%d", name, p.Line)
	default:
		return fmt.Sprintf("%s:%d:%d", name, p.Line, p.Col)
	}
}

// isBefore reports whether p precedes q, comparing lines first
// and then columns. Filenames are not compared.
func (p Position) isBefore(q Position) bool {
	return p.Line < q.Line || (p.Line == q.Line && p.Col < q.Col)
}
244
Laurent Le Brun689fc222018-02-22 19:37:18 +0100245// An scanner represents a single input file being parsed.
246type scanner struct {
alandonovan30e71c62019-01-04 13:48:12 -0500247 rest []byte // rest of input (in REPL, a line of input)
Laurent Le Brun689fc222018-02-22 19:37:18 +0100248 token []byte // token being scanned
249 pos Position // current input position
250 depth int // nesting of [ ] { } ( )
251 indentstk []int // stack of indentation levels
252 dents int // number of saved INDENT (>0) or OUTDENT (<0) tokens to return
253 lineStart bool // after NEWLINE; convert spaces to indentation tokens
254 keepComments bool // accumulate comments in slice
255 lineComments []Comment // list of full line comments (if keepComments)
256 suffixComments []Comment // list of suffix comments (if keepComments)
alandonovan30e71c62019-01-04 13:48:12 -0500257
258 readline func() ([]byte, error) // read next line of input (REPL only)
Laurent Le Brun689fc222018-02-22 19:37:18 +0100259}
260
261func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
alandonovan0a10e4f2021-02-08 12:20:22 -0500262 var firstLine, firstCol int32 = 1, 1
263 if portion, ok := src.(FilePortion); ok {
264 firstLine, firstCol = portion.FirstLine, portion.FirstCol
265 }
alandonovan30e71c62019-01-04 13:48:12 -0500266 sc := &scanner{
alandonovan0a10e4f2021-02-08 12:20:22 -0500267 pos: MakePosition(&filename, firstLine, firstCol),
Laurent Le Brun689fc222018-02-22 19:37:18 +0100268 indentstk: make([]int, 1, 10), // []int{0} + spare capacity
269 lineStart: true,
270 keepComments: keepComments,
alandonovan30e71c62019-01-04 13:48:12 -0500271 }
alandonovanebe61bd2021-02-12 16:57:32 -0500272 sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only
alandonovan30e71c62019-01-04 13:48:12 -0500273 if sc.readline == nil {
274 data, err := readSource(filename, src)
275 if err != nil {
276 return nil, err
277 }
278 sc.rest = data
279 }
280 return sc, nil
Alan Donovan312d1a52017-10-02 10:10:28 -0400281}
282
alandonovan2f5aafd2018-12-07 16:43:08 -0500283func readSource(filename string, src interface{}) ([]byte, error) {
Alan Donovan312d1a52017-10-02 10:10:28 -0400284 switch src := src.(type) {
285 case string:
alandonovan2f5aafd2018-12-07 16:43:08 -0500286 return []byte(src), nil
Alan Donovan312d1a52017-10-02 10:10:28 -0400287 case []byte:
alandonovan2f5aafd2018-12-07 16:43:08 -0500288 return src, nil
Alan Donovan312d1a52017-10-02 10:10:28 -0400289 case io.Reader:
alandonovan2f5aafd2018-12-07 16:43:08 -0500290 data, err := ioutil.ReadAll(src)
291 if err != nil {
292 err = &os.PathError{Op: "read", Path: filename, Err: err}
Misha Brukman40bc3a52020-01-25 19:52:27 -0500293 return nil, err
alandonovan2f5aafd2018-12-07 16:43:08 -0500294 }
295 return data, nil
alandonovan0a10e4f2021-02-08 12:20:22 -0500296 case FilePortion:
297 return src.Content, nil
Alan Donovan312d1a52017-10-02 10:10:28 -0400298 case nil:
alandonovan2f5aafd2018-12-07 16:43:08 -0500299 return ioutil.ReadFile(filename)
Alan Donovan312d1a52017-10-02 10:10:28 -0400300 default:
301 return nil, fmt.Errorf("invalid source: %T", src)
302 }
Alan Donovan312d1a52017-10-02 10:10:28 -0400303}
304
305// An Error describes the nature and position of a scanner or parser error.
306type Error struct {
307 Pos Position
308 Msg string
309}
310
311func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg }
312
313// errorf is called to report an error.
314// errorf does not return: it panics.
315func (sc *scanner) error(pos Position, s string) {
316 panic(Error{pos, s})
317}
318
319func (sc *scanner) errorf(pos Position, format string, args ...interface{}) {
320 sc.error(pos, fmt.Sprintf(format, args...))
321}
322
323func (sc *scanner) recover(err *error) {
324 // The scanner and parser panic both for routine errors like
325 // syntax errors and for programmer bugs like array index
326 // errors. Turn both into error returns. Catching bug panics
327 // is especially important when processing many files.
328 switch e := recover().(type) {
329 case nil:
330 // no panic
331 case Error:
332 *err = e
333 default:
334 *err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)}
335 if debug {
336 log.Fatal(*err)
337 }
338 }
339}
340
341// eof reports whether the input has reached end of file.
342func (sc *scanner) eof() bool {
alandonovan30e71c62019-01-04 13:48:12 -0500343 return len(sc.rest) == 0 && !sc.readLine()
344}
345
346// readLine attempts to read another line of input.
347// Precondition: len(sc.rest)==0.
348func (sc *scanner) readLine() bool {
349 if sc.readline != nil {
350 var err error
351 sc.rest, err = sc.readline()
352 if err != nil {
353 sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
354 }
355 return len(sc.rest) > 0
356 }
357 return false
Alan Donovan312d1a52017-10-02 10:10:28 -0400358}
359
360// peekRune returns the next rune in the input without consuming it.
361// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
362func (sc *scanner) peekRune() rune {
alandonovan30e71c62019-01-04 13:48:12 -0500363 // TODO(adonovan): opt: measure and perhaps inline eof.
364 if sc.eof() {
Alan Donovan312d1a52017-10-02 10:10:28 -0400365 return 0
366 }
367
368 // fast path: ASCII
369 if b := sc.rest[0]; b < utf8.RuneSelf {
370 if b == '\r' {
371 return '\n'
372 }
373 return rune(b)
374 }
375
376 r, _ := utf8.DecodeRune(sc.rest)
377 return r
378}
379
380// readRune consumes and returns the next rune in the input.
381// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
382func (sc *scanner) readRune() rune {
alandonovan30e71c62019-01-04 13:48:12 -0500383 // eof() has been inlined here, both to avoid a call
384 // and to establish len(rest)>0 to avoid a bounds check.
Alan Donovan312d1a52017-10-02 10:10:28 -0400385 if len(sc.rest) == 0 {
alandonovan30e71c62019-01-04 13:48:12 -0500386 if !sc.readLine() {
387 sc.error(sc.pos, "internal scanner error: readRune at EOF")
388 }
389 // Redundant, but eliminates the bounds-check below.
390 if len(sc.rest) == 0 {
391 return 0
392 }
Alan Donovan312d1a52017-10-02 10:10:28 -0400393 }
394
395 // fast path: ASCII
396 if b := sc.rest[0]; b < utf8.RuneSelf {
397 r := rune(b)
398 sc.rest = sc.rest[1:]
399 if r == '\r' {
400 if len(sc.rest) > 0 && sc.rest[0] == '\n' {
401 sc.rest = sc.rest[1:]
402 }
403 r = '\n'
404 }
405 if r == '\n' {
406 sc.pos.Line++
407 sc.pos.Col = 1
408 } else {
409 sc.pos.Col++
410 }
411 return r
412 }
413
414 r, size := utf8.DecodeRune(sc.rest)
415 sc.rest = sc.rest[size:]
416 sc.pos.Col++
417 return r
418}
419
420// tokenValue records the position and value associated with each token.
421type tokenValue struct {
422 raw string // raw text of token
423 int int64 // decoded int
Mohamed Elqdusy69e96152018-01-22 20:00:29 +0100424 bigInt *big.Int // decoded integers > int64
Alan Donovan312d1a52017-10-02 10:10:28 -0400425 float float64 // decoded float
alandonovanebe61bd2021-02-12 16:57:32 -0500426 string string // decoded string or bytes
Alan Donovan312d1a52017-10-02 10:10:28 -0400427 pos Position // start position of token
Alan Donovan312d1a52017-10-02 10:10:28 -0400428}
429
430// startToken marks the beginning of the next input token.
431// It must be followed by a call to endToken once the token has
432// been consumed using readRune.
433func (sc *scanner) startToken(val *tokenValue) {
434 sc.token = sc.rest
435 val.raw = ""
436 val.pos = sc.pos
437}
438
439// endToken marks the end of an input token.
440// It records the actual token string in val.raw if the caller
441// has not done that already.
442func (sc *scanner) endToken(val *tokenValue) {
443 if val.raw == "" {
444 val.raw = string(sc.token[:len(sc.token)-len(sc.rest)])
445 }
446}
447
448// nextToken is called by the parser to obtain the next input token.
449// It returns the token value and sets val to the data associated with
450// the token.
451//
452// For all our input tokens, the associated data is val.pos (the
453// position where the token begins), val.raw (the input string
454// corresponding to the token). For string and int tokens, the string
455// and int fields additionally contain the token's interpreted value.
456func (sc *scanner) nextToken(val *tokenValue) Token {
457
458 // The following distribution of tokens guides case ordering:
459 //
460 // COMMA 27 %
461 // STRING 23 %
462 // IDENT 15 %
463 // EQL 11 %
464 // LBRACK 5.5 %
465 // RBRACK 5.5 %
466 // NEWLINE 3 %
467 // LPAREN 2.9 %
468 // RPAREN 2.9 %
469 // INT 2 %
470 // others < 1 %
471 //
472 // Although NEWLINE tokens are infrequent, and lineStart is
473 // usually (~97%) false on entry, skipped newlines account for
474 // about 50% of all iterations of the 'start' loop.
475
476start:
477 var c rune
478
479 // Deal with leading spaces and indentation.
480 blank := false
481 savedLineStart := sc.lineStart
482 if sc.lineStart {
483 sc.lineStart = false
484 col := 0
485 for {
486 c = sc.peekRune()
487 if c == ' ' {
488 col++
489 sc.readRune()
490 } else if c == '\t' {
491 const tab = 8
492 col += int(tab - (sc.pos.Col-1)%tab)
493 sc.readRune()
494 } else {
495 break
496 }
497 }
alandonovanc1a3d542019-01-31 13:43:01 -0500498
499 // The third clause matches EOF.
500 if c == '#' || c == '\n' || c == 0 {
Alan Donovan312d1a52017-10-02 10:10:28 -0400501 blank = true
502 }
503
504 // Compute indentation level for non-blank lines not
505 // inside an expression. This is not the common case.
506 if !blank && sc.depth == 0 {
507 cur := sc.indentstk[len(sc.indentstk)-1]
508 if col > cur {
509 // indent
510 sc.dents++
511 sc.indentstk = append(sc.indentstk, col)
512 } else if col < cur {
alandonovanc1a3d542019-01-31 13:43:01 -0500513 // outdent(s)
Alan Donovan312d1a52017-10-02 10:10:28 -0400514 for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] {
515 sc.dents--
516 sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop
517 }
518 if col != sc.indentstk[len(sc.indentstk)-1] {
519 sc.error(sc.pos, "unindent does not match any outer indentation level")
520 }
521 }
522 }
523 }
524
525 // Return saved indentation tokens.
526 if sc.dents != 0 {
527 sc.startToken(val)
528 sc.endToken(val)
529 if sc.dents < 0 {
530 sc.dents++
531 return OUTDENT
532 } else {
533 sc.dents--
534 return INDENT
535 }
536 }
537
538 // start of line proper
539 c = sc.peekRune()
540
541 // Skip spaces.
542 for c == ' ' || c == '\t' {
543 sc.readRune()
544 c = sc.peekRune()
545 }
546
547 // comment
548 if c == '#' {
Laurent Le Brun689fc222018-02-22 19:37:18 +0100549 if sc.keepComments {
550 sc.startToken(val)
551 }
552 // Consume up to newline (included).
Alan Donovan312d1a52017-10-02 10:10:28 -0400553 for c != 0 && c != '\n' {
554 sc.readRune()
555 c = sc.peekRune()
556 }
Laurent Le Brun689fc222018-02-22 19:37:18 +0100557 if sc.keepComments {
558 sc.endToken(val)
559 if blank {
560 sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw})
561 } else {
562 sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw})
563 }
564 }
Alan Donovan312d1a52017-10-02 10:10:28 -0400565 }
566
567 // newline
568 if c == '\n' {
569 sc.lineStart = true
alandonovan30e71c62019-01-04 13:48:12 -0500570
571 // Ignore newlines within expressions (common case).
572 if sc.depth > 0 {
Alan Donovan312d1a52017-10-02 10:10:28 -0400573 sc.readRune()
574 goto start
575 }
alandonovan30e71c62019-01-04 13:48:12 -0500576
577 // Ignore blank lines, except in the REPL,
578 // where they emit OUTDENTs and NEWLINE.
579 if blank {
580 if sc.readline == nil {
581 sc.readRune()
582 goto start
583 } else if len(sc.indentstk) > 1 {
584 sc.dents = 1 - len(sc.indentstk)
alandonovanc1a3d542019-01-31 13:43:01 -0500585 sc.indentstk = sc.indentstk[:1]
alandonovan30e71c62019-01-04 13:48:12 -0500586 goto start
587 }
588 }
589
Alan Donovan312d1a52017-10-02 10:10:28 -0400590 // At top-level (not in an expression).
591 sc.startToken(val)
592 sc.readRune()
593 val.raw = "\n"
594 return NEWLINE
595 }
596
597 // end of file
598 if c == 0 {
599 // Emit OUTDENTs for unfinished indentation,
600 // preceded by a NEWLINE if we haven't just emitted one.
601 if len(sc.indentstk) > 1 {
602 if savedLineStart {
603 sc.dents = 1 - len(sc.indentstk)
alandonovanc1a3d542019-01-31 13:43:01 -0500604 sc.indentstk = sc.indentstk[:1]
Alan Donovan312d1a52017-10-02 10:10:28 -0400605 goto start
606 } else {
607 sc.lineStart = true
608 sc.startToken(val)
609 val.raw = "\n"
610 return NEWLINE
611 }
612 }
613
614 sc.startToken(val)
615 sc.endToken(val)
616 return EOF
617 }
618
619 // line continuation
620 if c == '\\' {
621 sc.readRune()
622 if sc.peekRune() != '\n' {
623 sc.errorf(sc.pos, "stray backslash in program")
624 }
625 sc.readRune()
626 goto start
627 }
628
629 // start of the next token
630 sc.startToken(val)
631
632 // comma (common case)
633 if c == ',' {
634 sc.readRune()
635 sc.endToken(val)
636 return COMMA
637 }
638
639 // string literal
640 if c == '"' || c == '\'' {
641 return sc.scanString(val, c)
642 }
643
644 // identifier or keyword
645 if isIdentStart(c) {
alandonovanebe61bd2021-02-12 16:57:32 -0500646 if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
647 // r"..."
648 // b"..."
649 sc.readRune()
650 c = sc.peekRune()
651 return sc.scanString(val, c)
652 } else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') {
653 // rb"..."
654 sc.readRune()
Alan Donovan312d1a52017-10-02 10:10:28 -0400655 sc.readRune()
656 c = sc.peekRune()
657 return sc.scanString(val, c)
658 }
659
660 for isIdent(c) {
661 sc.readRune()
662 c = sc.peekRune()
663 }
664 sc.endToken(val)
665 if k, ok := keywordToken[val.raw]; ok {
666 return k
667 }
668
669 return IDENT
670 }
671
672 // brackets
673 switch c {
674 case '[', '(', '{':
675 sc.depth++
676 sc.readRune()
677 sc.endToken(val)
678 switch c {
679 case '[':
680 return LBRACK
681 case '(':
682 return LPAREN
683 case '{':
684 return LBRACE
685 }
686 panic("unreachable")
687
688 case ']', ')', '}':
689 if sc.depth == 0 {
alandonovan990a7962018-12-17 12:42:51 -0500690 sc.errorf(sc.pos, "unexpected %q", c)
Alan Donovan312d1a52017-10-02 10:10:28 -0400691 } else {
692 sc.depth--
693 }
694 sc.readRune()
695 sc.endToken(val)
696 switch c {
697 case ']':
698 return RBRACK
699 case ')':
700 return RPAREN
701 case '}':
702 return RBRACE
703 }
704 panic("unreachable")
705 }
706
707 // int or float literal, or period
708 if isdigit(c) || c == '.' {
709 return sc.scanNumber(val, c)
710 }
711
712 // other punctuation
713 defer sc.endToken(val)
714 switch c {
alandonovan22479a32019-01-09 12:15:31 -0500715 case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '='
Ariel Mashrakicaa37b42017-10-27 19:27:28 +0300716 start := sc.pos
Alan Donovan312d1a52017-10-02 10:10:28 -0400717 sc.readRune()
718 if sc.peekRune() == '=' {
719 sc.readRune()
720 switch c {
721 case '<':
722 return LE
723 case '>':
724 return GE
725 case '=':
726 return EQL
727 case '!':
728 return NEQ
729 case '+':
730 return PLUS_EQ
731 case '-':
732 return MINUS_EQ
733 case '/':
734 return SLASH_EQ
735 case '%':
736 return PERCENT_EQ
Hittorp0a5e39a2018-08-09 15:02:30 +0300737 case '&':
738 return AMP_EQ
739 case '|':
740 return PIPE_EQ
741 case '^':
742 return CIRCUMFLEX_EQ
Alan Donovan312d1a52017-10-02 10:10:28 -0400743 }
744 }
745 switch c {
746 case '=':
747 return EQ
748 case '<':
Hittorp0a5e39a2018-08-09 15:02:30 +0300749 if sc.peekRune() == '<' {
750 sc.readRune()
751 if sc.peekRune() == '=' {
752 sc.readRune()
753 return LTLT_EQ
754 } else {
755 return LTLT
756 }
757 }
Alan Donovan312d1a52017-10-02 10:10:28 -0400758 return LT
759 case '>':
Hittorp0a5e39a2018-08-09 15:02:30 +0300760 if sc.peekRune() == '>' {
761 sc.readRune()
762 if sc.peekRune() == '=' {
763 sc.readRune()
764 return GTGT_EQ
765 } else {
766 return GTGT
767 }
768 }
Alan Donovan312d1a52017-10-02 10:10:28 -0400769 return GT
770 case '!':
Ariel Mashrakicaa37b42017-10-27 19:27:28 +0300771 sc.error(start, "unexpected input character '!'")
Alan Donovan312d1a52017-10-02 10:10:28 -0400772 case '+':
773 return PLUS
774 case '-':
775 return MINUS
776 case '/':
777 if sc.peekRune() == '/' {
778 sc.readRune()
779 if sc.peekRune() == '=' {
780 sc.readRune()
781 return SLASHSLASH_EQ
782 } else {
783 return SLASHSLASH
784 }
785 }
786 return SLASH
787 case '%':
788 return PERCENT
Hittorp0a5e39a2018-08-09 15:02:30 +0300789 case '&':
790 return AMP
791 case '|':
792 return PIPE
793 case '^':
794 return CIRCUMFLEX
Alan Donovan312d1a52017-10-02 10:10:28 -0400795 }
796 panic("unreachable")
797
alandonovan22479a32019-01-09 12:15:31 -0500798 case ':', ';', '~': // single-char tokens (except comma)
Alan Donovan312d1a52017-10-02 10:10:28 -0400799 sc.readRune()
800 switch c {
801 case ':':
802 return COLON
803 case ';':
804 return SEMI
alandonovan22479a32019-01-09 12:15:31 -0500805 case '~':
806 return TILDE
Alan Donovan312d1a52017-10-02 10:10:28 -0400807 }
808 panic("unreachable")
809
810 case '*': // possibly followed by '*' or '='
811 sc.readRune()
812 switch sc.peekRune() {
813 case '*':
814 sc.readRune()
815 return STARSTAR
816 case '=':
817 sc.readRune()
818 return STAR_EQ
819 }
820 return STAR
821 }
822
823 sc.errorf(sc.pos, "unexpected input character %#q", c)
824 panic("unreachable")
825}
826
827func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
Ariel Mashrakicaa37b42017-10-27 19:27:28 +0300828 start := sc.pos
Alan Donovan312d1a52017-10-02 10:10:28 -0400829 triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
830 sc.readRune()
alandonovane8819e82020-03-26 17:56:36 -0400831
832 // String literals may contain escaped or unescaped newlines,
833 // causing them to span multiple lines (gulps) of REPL input;
834 // they are the only such token. Thus we cannot call endToken,
835 // as it assumes sc.rest is unchanged since startToken.
836 // Instead, buffer the token here.
837 // TODO(adonovan): opt: buffer only if we encounter a newline.
838 raw := new(strings.Builder)
839
840 // Copy the prefix, e.g. r' or " (see startToken).
841 raw.Write(sc.token[:len(sc.token)-len(sc.rest)])
842
alandonovan30e71c62019-01-04 13:48:12 -0500843 if !triple {
alandonovane8819e82020-03-26 17:56:36 -0400844 // single-quoted string literal
alandonovan30e71c62019-01-04 13:48:12 -0500845 for {
Alan Donovan312d1a52017-10-02 10:10:28 -0400846 if sc.eof() {
847 sc.error(val.pos, "unexpected EOF in string")
848 }
alandonovan30e71c62019-01-04 13:48:12 -0500849 c := sc.readRune()
alandonovane8819e82020-03-26 17:56:36 -0400850 raw.WriteRune(c)
alandonovan30e71c62019-01-04 13:48:12 -0500851 if c == quote {
852 break
853 }
854 if c == '\n' {
855 sc.error(val.pos, "unexpected newline in string")
856 }
857 if c == '\\' {
858 if sc.eof() {
859 sc.error(val.pos, "unexpected EOF in string")
860 }
alandonovane8819e82020-03-26 17:56:36 -0400861 c = sc.readRune()
862 raw.WriteRune(c)
alandonovan30e71c62019-01-04 13:48:12 -0500863 }
Alan Donovan312d1a52017-10-02 10:10:28 -0400864 }
alandonovan30e71c62019-01-04 13:48:12 -0500865 } else {
866 // triple-quoted string literal
867 sc.readRune()
alandonovane8819e82020-03-26 17:56:36 -0400868 raw.WriteRune(quote)
alandonovan30e71c62019-01-04 13:48:12 -0500869 sc.readRune()
alandonovane8819e82020-03-26 17:56:36 -0400870 raw.WriteRune(quote)
alandonovan30e71c62019-01-04 13:48:12 -0500871
872 quoteCount := 0
873 for {
874 if sc.eof() {
875 sc.error(val.pos, "unexpected EOF in string")
876 }
877 c := sc.readRune()
878 raw.WriteRune(c)
879 if c == quote {
880 quoteCount++
881 if quoteCount == 3 {
882 break
883 }
884 } else {
885 quoteCount = 0
886 }
887 if c == '\\' {
888 if sc.eof() {
889 sc.error(val.pos, "unexpected EOF in string")
890 }
891 c = sc.readRune()
892 raw.WriteRune(c)
893 }
894 }
Alan Donovan312d1a52017-10-02 10:10:28 -0400895 }
alandonovane8819e82020-03-26 17:56:36 -0400896 val.raw = raw.String()
Alan Donovan312d1a52017-10-02 10:10:28 -0400897
alandonovanebe61bd2021-02-12 16:57:32 -0500898 s, _, isByte, err := unquote(val.raw)
Alan Donovan312d1a52017-10-02 10:10:28 -0400899 if err != nil {
Ariel Mashrakicaa37b42017-10-27 19:27:28 +0300900 sc.error(start, err.Error())
Alan Donovan312d1a52017-10-02 10:10:28 -0400901 }
902 val.string = s
alandonovanebe61bd2021-02-12 16:57:32 -0500903 if isByte {
904 return BYTES
905 } else {
906 return STRING
907 }
Alan Donovan312d1a52017-10-02 10:10:28 -0400908}
909
910func (sc *scanner) scanNumber(val *tokenValue, c rune) Token {
alandonovan04aba6e2018-11-05 17:45:33 -0500911 // https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements
alandonovan5ce1e422017-10-17 15:20:32 -0400912 //
913 // Python features not supported:
Alan Donovan312d1a52017-10-02 10:10:28 -0400914 // - integer literals of >64 bits of precision
915 // - 123L or 123l long suffix
916 // - traditional octal: 0755
alandonovan5ce1e422017-10-17 15:20:32 -0400917 // https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals
Alan Donovan312d1a52017-10-02 10:10:28 -0400918
Ariel Mashrakicaa37b42017-10-27 19:27:28 +0300919 start := sc.pos
Alan Donovan312d1a52017-10-02 10:10:28 -0400920 fraction, exponent := false, false
921
922 if c == '.' {
923 // dot or start of fraction
924 sc.readRune()
925 c = sc.peekRune()
926 if !isdigit(c) {
927 sc.endToken(val)
928 return DOT
929 }
930 fraction = true
931 } else if c == '0' {
Mohamed Elqdusy3b32df92018-01-08 17:20:46 +0100932 // hex, octal, binary or float
Alan Donovan312d1a52017-10-02 10:10:28 -0400933 sc.readRune()
934 c = sc.peekRune()
935
936 if c == '.' {
937 fraction = true
938 } else if c == 'x' || c == 'X' {
939 // hex
940 sc.readRune()
941 c = sc.peekRune()
942 if !isxdigit(c) {
Ariel Mashrakicaa37b42017-10-27 19:27:28 +0300943 sc.error(start, "invalid hex literal")
Alan Donovan312d1a52017-10-02 10:10:28 -0400944 }
945 for isxdigit(c) {
946 sc.readRune()
947 c = sc.peekRune()
948 }
949 } else if c == 'o' || c == 'O' {
950 // octal
951 sc.readRune()
952 c = sc.peekRune()
953 if !isodigit(c) {
954 sc.error(sc.pos, "invalid octal literal")
955 }
956 for isodigit(c) {
957 sc.readRune()
958 c = sc.peekRune()
959 }
Mohamed Elqdusy3b32df92018-01-08 17:20:46 +0100960 } else if c == 'b' || c == 'B' {
961 // binary
962 sc.readRune()
963 c = sc.peekRune()
964 if !isbdigit(c) {
965 sc.error(sc.pos, "invalid binary literal")
966 }
967 for isbdigit(c) {
968 sc.readRune()
969 c = sc.peekRune()
970 }
Alan Donovan312d1a52017-10-02 10:10:28 -0400971 } else {
972 // float (or obsolete octal "0755")
973 allzeros, octal := true, true
974 for isdigit(c) {
975 if c != '0' {
976 allzeros = false
977 }
978 if c > '7' {
979 octal = false
980 }
981 sc.readRune()
982 c = sc.peekRune()
983 }
984 if c == '.' {
985 fraction = true
986 } else if c == 'e' || c == 'E' {
987 exponent = true
988 } else if octal && !allzeros {
alandonovana4759312019-05-28 16:17:46 -0400989 sc.endToken(val)
990 sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:])
Alan Donovan312d1a52017-10-02 10:10:28 -0400991 }
992 }
993 } else {
994 // decimal
995 for isdigit(c) {
996 sc.readRune()
997 c = sc.peekRune()
998 }
999
1000 if c == '.' {
1001 fraction = true
1002 } else if c == 'e' || c == 'E' {
1003 exponent = true
1004 }
1005 }
1006
1007 if fraction {
1008 sc.readRune() // consume '.'
1009 c = sc.peekRune()
1010 for isdigit(c) {
1011 sc.readRune()
1012 c = sc.peekRune()
1013 }
1014
1015 if c == 'e' || c == 'E' {
1016 exponent = true
1017 }
1018 }
1019
1020 if exponent {
1021 sc.readRune() // consume [eE]
1022 c = sc.peekRune()
1023 if c == '+' || c == '-' {
1024 sc.readRune()
1025 c = sc.peekRune()
1026 if !isdigit(c) {
1027 sc.error(sc.pos, "invalid float literal")
1028 }
1029 }
1030 for isdigit(c) {
1031 sc.readRune()
1032 c = sc.peekRune()
1033 }
1034 }
1035
1036 sc.endToken(val)
1037 if fraction || exponent {
1038 var err error
1039 val.float, err = strconv.ParseFloat(val.raw, 64)
1040 if err != nil {
1041 sc.error(sc.pos, "invalid float literal")
1042 }
1043 return FLOAT
1044 } else {
1045 var err error
1046 s := val.raw
Mohamed Elqdusy69e96152018-01-22 20:00:29 +01001047 val.bigInt = nil
Alan Donovan312d1a52017-10-02 10:10:28 -04001048 if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') {
1049 val.int, err = strconv.ParseInt(s[2:], 8, 64)
Mohamed Elqdusy3b32df92018-01-08 17:20:46 +01001050 } else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') {
1051 val.int, err = strconv.ParseInt(s[2:], 2, 64)
Alan Donovan312d1a52017-10-02 10:10:28 -04001052 } else {
1053 val.int, err = strconv.ParseInt(s, 0, 64)
Mohamed Elqdusy69e96152018-01-22 20:00:29 +01001054 if err != nil {
1055 num := new(big.Int)
Misha Brukman40bc3a52020-01-25 19:52:27 -05001056 var ok bool
Mohamed Elqdusy69e96152018-01-22 20:00:29 +01001057 val.bigInt, ok = num.SetString(s, 0)
1058 if ok {
1059 err = nil
1060 }
1061 }
Alan Donovan312d1a52017-10-02 10:10:28 -04001062 }
1063 if err != nil {
Ariel Mashrakicaa37b42017-10-27 19:27:28 +03001064 sc.error(start, "invalid int literal")
Alan Donovan312d1a52017-10-02 10:10:28 -04001065 }
1066 return INT
1067 }
1068}
1069
1070// isIdent reports whether c is an identifier rune.
1071func isIdent(c rune) bool {
1072 return isdigit(c) || isIdentStart(c)
1073}
1074
// isIdentStart reports whether c may begin an identifier:
// an underscore or any Unicode letter.
func isIdentStart(c rune) bool {
	switch {
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', c == '_':
		// fast path: ASCII
		return true
	}
	return unicode.IsLetter(c)
}
1081
// isdigit reports whether c is an ASCII decimal digit.
func isdigit(c rune) bool { return c >= '0' && c <= '9' }

// isodigit reports whether c is an octal digit.
func isodigit(c rune) bool { return c >= '0' && c <= '7' }

// isxdigit reports whether c is a hexadecimal digit of either case.
func isxdigit(c rune) bool {
	switch {
	case '0' <= c && c <= '9', 'a' <= c && c <= 'f', 'A' <= c && c <= 'F':
		return true
	}
	return false
}

// isbdigit reports whether c is a binary digit.
func isbdigit(c rune) bool { return c == '0' || c == '1' }
Alan Donovan312d1a52017-10-02 10:10:28 -04001086
1087// keywordToken records the special tokens for
1088// strings that should not be treated as ordinary identifiers.
1089var keywordToken = map[string]Token{
1090 "and": AND,
1091 "break": BREAK,
1092 "continue": CONTINUE,
1093 "def": DEF,
1094 "elif": ELIF,
1095 "else": ELSE,
1096 "for": FOR,
1097 "if": IF,
1098 "in": IN,
1099 "lambda": LAMBDA,
alandonovan6696fc32017-10-20 10:55:17 -04001100 "load": LOAD,
Alan Donovan312d1a52017-10-02 10:10:28 -04001101 "not": NOT,
1102 "or": OR,
1103 "pass": PASS,
1104 "return": RETURN,
Alessandro Arzilli678bafe2018-12-07 17:28:35 +01001105 "while": WHILE,
Alan Donovan312d1a52017-10-02 10:10:28 -04001106
1107 // reserved words:
1108 "as": ILLEGAL,
1109 // "assert": ILLEGAL, // heavily used by our tests
1110 "class": ILLEGAL,
1111 "del": ILLEGAL,
1112 "except": ILLEGAL,
1113 "finally": ILLEGAL,
1114 "from": ILLEGAL,
1115 "global": ILLEGAL,
1116 "import": ILLEGAL,
1117 "is": ILLEGAL,
1118 "nonlocal": ILLEGAL,
1119 "raise": ILLEGAL,
1120 "try": ILLEGAL,
Alan Donovan312d1a52017-10-02 10:10:28 -04001121 "with": ILLEGAL,
1122 "yield": ILLEGAL,
1123}