this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

cue/scanner: add package

Change-Id: I60859ae7a1de096793e6b2f86676e8bdc426caf6

+2020
+831
cue/scanner/scanner.go
··· 1 + // Copyright 2018 The CUE Authors 2 + // 3 + // Licensed under the Apache License, Version 2.0 (the "License"); 4 + // you may not use this file except in compliance with the License. 5 + // You may obtain a copy of the License at 6 + // 7 + // http://www.apache.org/licenses/LICENSE-2.0 8 + // 9 + // Unless required by applicable law or agreed to in writing, software 10 + // distributed under the License is distributed on an "AS IS" BASIS, 11 + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 + // See the License for the specific language governing permissions and 13 + // limitations under the License. 14 + 15 + // Package scanner implements a scanner for CUE source text. It takes a []byte 16 + // as source which can then be tokenized through repeated calls to the Scan 17 + // method. 18 + package scanner // import "cuelang.org/go/cue/scanner" 19 + 20 + import ( 21 + "bytes" 22 + "fmt" 23 + "path/filepath" 24 + "strconv" 25 + "unicode" 26 + "unicode/utf8" 27 + 28 + "cuelang.org/go/cue/errors" 29 + "cuelang.org/go/cue/token" 30 + ) 31 + 32 + // A Scanner holds the Scanner's internal state while processing 33 + // a given text. It can be allocated as part of another data 34 + // structure but must be initialized via Init before use. 
35 + type Scanner struct { 36 + // immutable state 37 + file *token.File // source file handle 38 + dir string // directory portion of file.Name() 39 + src []byte // source 40 + err errors.Handler // error reporting; or nil 41 + mode Mode // scanning mode 42 + 43 + // scanning state 44 + ch rune // current character 45 + offset int // character offset 46 + rdOffset int // reading offset (position after current character) 47 + lineOffset int // current line offset 48 + linesSinceLast int 49 + spacesSinceLast int 50 + insertEOL bool // insert a comma before next newline 51 + 52 + // public state - ok to modify 53 + ErrorCount int // number of errors encountered 54 + } 55 + 56 + const bom = 0xFEFF // byte order mark, only permitted as very first character 57 + 58 + // Read the next Unicode char into s.ch. 59 + // s.ch < 0 means end-of-file. 60 + func (s *Scanner) next() { 61 + if s.rdOffset < len(s.src) { 62 + s.offset = s.rdOffset 63 + if s.ch == '\n' { 64 + s.lineOffset = s.offset 65 + s.file.AddLine(s.offset) 66 + } 67 + r, w := rune(s.src[s.rdOffset]), 1 68 + switch { 69 + case r == 0: 70 + s.error(s.offset, "illegal character NUL") 71 + case r >= utf8.RuneSelf: 72 + // not ASCII 73 + r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 74 + if r == utf8.RuneError && w == 1 { 75 + s.error(s.offset, "illegal UTF-8 encoding") 76 + } else if r == bom && s.offset > 0 { 77 + s.error(s.offset, "illegal byte order mark") 78 + } 79 + } 80 + s.rdOffset += w 81 + s.ch = r 82 + } else { 83 + s.offset = len(s.src) 84 + if s.ch == '\n' { 85 + s.lineOffset = s.offset 86 + s.file.AddLine(s.offset) 87 + } 88 + s.ch = -1 // eof 89 + } 90 + } 91 + 92 + // A Mode value is a set of flags (or 0). 93 + // They control scanner behavior. 94 + type Mode uint 95 + 96 + // These constants are options to the Init function. 
97 + const ( 98 + ScanComments Mode = 1 << iota // return comments as COMMENT tokens 99 + dontInsertCommas // do not automatically insert commas - for testing only 100 + ) 101 + 102 + // Init prepares the scanner s to tokenize the text src by setting the 103 + // scanner at the beginning of src. The scanner uses the file set file 104 + // for position information and it adds line information for each line. 105 + // It is ok to re-use the same file when re-scanning the same file as 106 + // line information which is already present is ignored. Init causes a 107 + // panic if the file size does not match the src size. 108 + // 109 + // Calls to Scan will invoke the error handler err if they encounter a 110 + // syntax error and err is not nil. Also, for each error encountered, 111 + // the Scanner field ErrorCount is incremented by one. The mode parameter 112 + // determines how comments are handled. 113 + // 114 + // Note that Init may call err if there is an error in the first character 115 + // of the file. 116 + func (s *Scanner) Init(file *token.File, src []byte, err errors.Handler, mode Mode) { 117 + // Explicitly initialize all fields since a scanner may be reused. 
118 + if file.Size() != len(src) { 119 + panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 120 + } 121 + s.file = file 122 + s.dir, _ = filepath.Split(file.Name()) 123 + s.src = src 124 + s.err = err 125 + s.mode = mode 126 + 127 + s.ch = ' ' 128 + s.offset = 0 129 + s.rdOffset = 0 130 + s.lineOffset = 0 131 + s.insertEOL = false 132 + s.ErrorCount = 0 133 + 134 + s.next() 135 + if s.ch == bom { 136 + s.next() // ignore BOM at file beginning 137 + } 138 + } 139 + 140 + func (s *Scanner) error(offs int, msg string) { 141 + if s.err != nil { 142 + s.err(s.file.Position(s.file.Pos(offs, 0)), msg) 143 + } 144 + s.ErrorCount++ 145 + } 146 + 147 + var prefix = []byte("//line ") 148 + 149 + func (s *Scanner) interpretLineComment(text []byte) { 150 + if bytes.HasPrefix(text, prefix) { 151 + // get filename and line number, if any 152 + if i := bytes.LastIndex(text, []byte{':'}); i > 0 { 153 + if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 { 154 + // valid //line filename:line comment 155 + filename := string(bytes.TrimSpace(text[len(prefix):i])) 156 + if filename != "" { 157 + filename = filepath.Clean(filename) 158 + if !filepath.IsAbs(filename) { 159 + // make filename relative to current directory 160 + filename = filepath.Join(s.dir, filename) 161 + } 162 + } 163 + // update scanner position 164 + s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line 165 + } 166 + } 167 + } 168 + } 169 + 170 + func (s *Scanner) scanComment() string { 171 + // initial '/' already consumed; s.ch == '/' || s.ch == '*' 172 + offs := s.offset - 1 // position of initial '/' 173 + hasCR := false 174 + 175 + if s.ch == '/' { 176 + //-style comment 177 + s.next() 178 + for s.ch != '\n' && s.ch >= 0 { 179 + if s.ch == '\r' { 180 + hasCR = true 181 + } 182 + s.next() 183 + } 184 + if offs == s.lineOffset { 185 + // comment starts at the beginning of the current line 186 + 
s.interpretLineComment(s.src[offs:s.offset]) 187 + } 188 + goto exit 189 + } 190 + 191 + /*-style comment */ 192 + s.next() 193 + for s.ch >= 0 { 194 + ch := s.ch 195 + if ch == '\r' { 196 + hasCR = true 197 + } 198 + s.next() 199 + if ch == '*' && s.ch == '/' { 200 + s.next() 201 + goto exit 202 + } 203 + } 204 + 205 + s.error(offs, "comment not terminated") 206 + 207 + exit: 208 + lit := s.src[offs:s.offset] 209 + if hasCR { 210 + // TODO: preserve /r/n 211 + lit = stripCR(lit) 212 + } 213 + 214 + return string(lit) 215 + } 216 + 217 + func (s *Scanner) findLineEnd() bool { 218 + // initial '/' already consumed 219 + 220 + defer func(offs int) { 221 + // reset scanner state to where it was upon calling findLineEnd 222 + s.ch = '/' 223 + s.offset = offs 224 + s.rdOffset = offs + 1 225 + s.next() // consume initial '/' again 226 + }(s.offset - 1) 227 + 228 + // read ahead until a newline, EOF, or non-comment token is found 229 + for s.ch == '/' || s.ch == '*' { 230 + if s.ch == '/' { 231 + //-style comment always contains a newline 232 + return true 233 + } 234 + /*-style comment: look for newline */ 235 + s.next() 236 + for s.ch >= 0 { 237 + ch := s.ch 238 + if ch == '\n' { 239 + return true 240 + } 241 + s.next() 242 + if ch == '*' && s.ch == '/' { 243 + s.next() 244 + break 245 + } 246 + } 247 + s.skipWhitespace(0) // s.insertSemi is set 248 + if s.ch < 0 || s.ch == '\n' { 249 + return true 250 + } 251 + if s.ch != '/' { 252 + // non-comment token 253 + return false 254 + } 255 + s.next() // consume '/' 256 + } 257 + 258 + return false 259 + } 260 + 261 + func isLetter(ch rune) bool { 262 + return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) 263 + } 264 + 265 + func isDigit(ch rune) bool { 266 + // TODO(mpvl): Is this correct? 
267 + return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch) 268 + } 269 + 270 + func (s *Scanner) scanIdentifier() string { 271 + offs := s.offset 272 + for isLetter(s.ch) || isDigit(s.ch) || s.ch == '_' { 273 + s.next() 274 + } 275 + return string(s.src[offs:s.offset]) 276 + } 277 + 278 + func digitVal(ch rune) int { 279 + switch { 280 + case '0' <= ch && ch <= '9': 281 + return int(ch - '0') 282 + case ch == '_': 283 + return 0 284 + case 'a' <= ch && ch <= 'f': 285 + return int(ch - 'a' + 10) 286 + case 'A' <= ch && ch <= 'F': 287 + return int(ch - 'A' + 10) 288 + } 289 + return 16 // larger than any legal digit val 290 + } 291 + 292 + func (s *Scanner) scanMantissa(base int) { 293 + var last rune 294 + for digitVal(s.ch) < base { 295 + last = s.ch 296 + s.next() 297 + } 298 + if last == '_' { 299 + s.error(s.offset-1, "illegal '_' in number") 300 + } 301 + } 302 + 303 + func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) { 304 + // digitVal(s.ch) < 10 305 + offs := s.offset 306 + tok := token.INT 307 + 308 + if seenDecimalPoint { 309 + offs-- 310 + tok = token.FLOAT 311 + s.scanMantissa(10) 312 + goto exponent 313 + } 314 + 315 + if s.ch == '0' { 316 + // int or float 317 + offs := s.offset 318 + s.next() 319 + if s.ch == 'x' || s.ch == 'X' { 320 + // hexadecimal int 321 + s.next() 322 + s.scanMantissa(16) 323 + if s.offset-offs <= 2 { 324 + // only scanned "0x" or "0X" 325 + s.error(offs, "illegal hexadecimal number") 326 + } 327 + } else if s.ch == 'b' || s.ch == 'B' { 328 + // binary int 329 + s.next() 330 + s.scanMantissa(2) 331 + if s.offset-offs <= 2 { 332 + // only scanned "0b" or "0B" 333 + s.error(offs, "illegal binary number") 334 + } 335 + } else { 336 + // octal int or float 337 + seenDecimalDigit := false 338 + s.scanMantissa(8) 339 + if s.ch == '8' || s.ch == '9' { 340 + // illegal octal int or float 341 + seenDecimalDigit = true 342 + s.scanMantissa(10) 343 + } 344 + // TODO: disallow complex. 
345 + if s.ch == '.' || s.ch == 'e' { 346 + goto fraction 347 + } 348 + // octal int 349 + if seenDecimalDigit { 350 + s.error(offs, "illegal octal number") 351 + } 352 + } 353 + goto exit 354 + } 355 + 356 + // decimal int or float 357 + s.scanMantissa(10) 358 + 359 + // TODO: allow 3h4s, etc. 360 + // switch s.ch { 361 + // case 'h', 'm', 's', "µ"[0], 'u', 'n': 362 + // } 363 + 364 + fraction: 365 + if s.ch == '.' { 366 + if p := s.offset + 1; p < len(s.src) && s.src[p] == '.' { 367 + // interpret dot as part of a range. 368 + goto exit 369 + } 370 + tok = token.FLOAT 371 + s.next() 372 + s.scanMantissa(10) 373 + } 374 + 375 + exponent: 376 + switch s.ch { 377 + case 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y': 378 + tok = token.INT // TODO: Or should we allow this to be a float? 379 + s.next() 380 + if s.ch == 'i' { 381 + s.next() 382 + } 383 + goto exit 384 + } 385 + 386 + // TODO: allow 'E' for exponent? Could be used for Exa 387 + if s.ch == 'e' { // || s.ch == 'E' { 388 + tok = token.FLOAT 389 + s.next() 390 + if s.ch == '-' || s.ch == '+' { 391 + s.next() 392 + } 393 + s.scanMantissa(10) 394 + } 395 + 396 + exit: 397 + return tok, string(s.src[offs:s.offset]) 398 + } 399 + 400 + // scanEscape parses an escape sequence where rune is the accepted 401 + // escaped quote. In case of a syntax error, it stops at the offending 402 + // character (without consuming it) and returns false. Otherwise 403 + // it returns true. 
404 + func (s *Scanner) scanEscape(quote rune) (ok, template bool) { 405 + offs := s.offset 406 + 407 + var n int 408 + var base, max uint32 409 + switch s.ch { 410 + // TODO: remove 411 + case '(': 412 + return true, true 413 + case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 414 + s.next() 415 + return true, false 416 + case '0', '1', '2', '3', '4', '5', '6', '7': 417 + n, base, max = 3, 8, 255 418 + case 'x': 419 + s.next() 420 + n, base, max = 2, 16, 255 421 + case 'u': 422 + s.next() 423 + n, base, max = 4, 16, unicode.MaxRune 424 + case 'U': 425 + s.next() 426 + n, base, max = 8, 16, unicode.MaxRune 427 + default: 428 + msg := "unknown escape sequence" 429 + if s.ch < 0 { 430 + msg = "escape sequence not terminated" 431 + } 432 + s.error(offs, msg) 433 + return false, false 434 + } 435 + 436 + var x uint32 437 + for n > 0 { 438 + d := uint32(digitVal(s.ch)) 439 + if d >= base { 440 + msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch) 441 + if s.ch < 0 { 442 + msg = "escape sequence not terminated" 443 + } 444 + s.error(s.offset, msg) 445 + return false, false 446 + } 447 + x = x*base + d 448 + s.next() 449 + n-- 450 + } 451 + 452 + if x > max || 0xD800 <= x && x < 0xE000 { 453 + s.error(offs, "escape sequence is invalid Unicode code point") 454 + return false, false 455 + } 456 + 457 + return true, false 458 + } 459 + 460 + func (s *Scanner) scanString(quote rune, offset, numQuotes int) (token.Token, string) { 461 + // ", """, ', or ''' opening already consumed 462 + offs := s.offset - offset 463 + 464 + tok := token.STRING 465 + 466 + hasCR := false 467 + extra := 0 468 + for { 469 + ch, n := s.consumeQuotes(quote, numQuotes) 470 + if n == numQuotes { 471 + break 472 + } 473 + if (numQuotes != 3 && ch == '\n') || ch < 0 { 474 + s.error(offs, "string literal not terminated") 475 + lit := s.src[offs:s.offset] 476 + if hasCR { 477 + lit = stripCR(lit) 478 + } 479 + return tok, string(lit) 480 + } 481 + if ch == '\r' && numQuotes == 3 { 482 
+ hasCR = true 483 + } 484 + s.next() 485 + if ch == '\\' { 486 + if s.ch == '(' { 487 + tok = token.INTERPOLATION 488 + extra = 1 489 + break 490 + } 491 + s.scanEscape(quote) 492 + } 493 + } 494 + lit := s.src[offs : s.offset+extra] 495 + if hasCR { 496 + lit = stripCR(lit) 497 + } 498 + return tok, string(lit) 499 + } 500 + 501 + func (s *Scanner) consumeQuotes(quote rune, max int) (next rune, n int) { 502 + for ; n < max; n++ { 503 + if s.ch != quote { 504 + return s.ch, n 505 + } 506 + s.next() 507 + } 508 + return s.ch, n 509 + } 510 + 511 + func stripCR(b []byte) []byte { 512 + c := make([]byte, len(b)) 513 + i := 0 514 + for _, ch := range b { 515 + if ch != '\r' { 516 + c[i] = ch 517 + i++ 518 + } 519 + } 520 + return c[:i] 521 + } 522 + 523 + func (s *Scanner) scanRawString() string { 524 + // '`' opening already consumed 525 + offs := s.offset - 1 526 + 527 + hasCR := false 528 + for { 529 + ch := s.ch 530 + if ch < 0 { 531 + s.error(offs, "raw string literal not terminated") 532 + break 533 + } 534 + s.next() 535 + if ch == '`' { 536 + break 537 + } 538 + if ch == '\r' { 539 + hasCR = true 540 + } 541 + } 542 + 543 + lit := s.src[offs:s.offset] 544 + if hasCR { 545 + lit = stripCR(lit) 546 + } 547 + 548 + return string(lit) 549 + } 550 + 551 + func (s *Scanner) skipWhitespace(inc int) { 552 + for { 553 + switch s.ch { 554 + case ' ', '\t': 555 + s.spacesSinceLast += inc 556 + case '\n': 557 + s.linesSinceLast += inc 558 + if s.insertEOL { 559 + return 560 + } 561 + case '\r': 562 + default: 563 + return 564 + } 565 + s.next() 566 + } 567 + } 568 + 569 + // Helper functions for scanning multi-byte tokens such as >> += >>= . 570 + // Different routines recognize different length tok_i based on matches 571 + // of ch_i. If a token ends in '=', the result is tok1 or tok3 572 + // respectively. Otherwise, the result is tok0 if there was no other 573 + // matching character, or tok2 if the matching character was ch2. 
574 + 575 + func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token { 576 + if s.ch == '=' { 577 + s.next() 578 + return tok1 579 + } 580 + return tok0 581 + } 582 + 583 + // ResumeInterpolation resumes scanning of a string interpolation. 584 + func (s *Scanner) ResumeInterpolation(quote rune, numQuotes int) string { 585 + _, str := s.scanString(quote, 1, numQuotes) 586 + return str 587 + } 588 + 589 + // Scan scans the next token and returns the token position, the token, 590 + // and its literal string if applicable. The source end is indicated by 591 + // EOF. 592 + // 593 + // If the returned token is a literal (IDENT, INT, FLOAT, 594 + // IMAG, CHAR, STRING) or COMMENT, the literal string 595 + // has the corresponding value. 596 + // 597 + // If the returned token is a keyword, the literal string is the keyword. 598 + // 599 + // If the returned token is Comma, the corresponding 600 + // literal string is "," if the comma was present in the source, 601 + // and "\n" if the semicolon was inserted because of a newline or 602 + // at EOF. 603 + // 604 + // If the returned token is ILLEGAL, the literal string is the 605 + // offending character. 606 + // 607 + // In all other cases, Scan returns an empty literal string. 608 + // 609 + // For more tolerant parsing, Scan will return a valid token if 610 + // possible even if a syntax error was encountered. Thus, even 611 + // if the resulting token sequence contains no illegal tokens, 612 + // a client may not assume that no error occurred. Instead it 613 + // must check the scanner's ErrorCount or the number of calls 614 + // of the error handler, if there was one installed. 615 + // 616 + // Scan adds line information to the file added to the file 617 + // set with Init. Token positions are relative to that file 618 + // and thus relative to the file set. 
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace(1)

	// classify the whitespace skipped since the previous token
	var rel token.RelPos
	switch {
	case s.linesSinceLast > 1:
		rel = token.NewSection
	case s.linesSinceLast == 1:
		rel = token.Newline
	case s.spacesSinceLast > 0:
		rel = token.Blank
	default:
		rel = token.NoSpace
	}
	// current token start
	offset := s.offset
	pos = s.file.Pos(offset, rel)

	// determine token value
	// insertEOL records whether a newline following this token should be
	// turned into a comma; it is stored in s.insertEOL on the way out.
	insertEOL := false
	switch ch := s.ch; {
	// case ch == '$':
	// 	lit = string(rune(ch))
	// 	s.next()
	// 	fallthrough
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.TRUE, token.FALSE, token.NULL, token.BOTTOM:
				insertEOL = true
			}
		} else {
			insertEOL = true
			tok = token.IDENT
		}
	case '0' <= ch && ch <= '9':
		insertEOL = true
		tok, lit = s.scanNumber(false)
	default:
		s.next() // always make progress
		switch ch {
		case -1:
			if s.insertEOL {
				s.insertEOL = false // EOF consumed
				return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
			}
			tok = token.EOF
		case '_':
			if s.ch == '|' {
				// Unconditionally require this to be followed by another
				// underscore to avoid needing an extra lookahead.
				// Note that `_|x` is always equal to x.
				s.next()
				if s.ch != '_' {
					s.error(s.file.Offset(pos), "illegal token '_|'; expected '_'")
					insertEOL = s.insertEOL // preserve insertEOL info
					tok = token.ILLEGAL
					lit = "_|"
					// leaves the switch, skipping the insertEOL = true below
					break
				}
				s.next()
				tok = token.BOTTOM
				lit = "_|_"
			} else {
				tok = token.IDENT
				lit = "_" + s.scanIdentifier()
			}
			insertEOL = true
		case '\n':
			// we only reach here if s.insertEOL was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertEOL = false // newline consumed
			return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
		case '"', '\'':
			insertEOL = true
			// The opening quote is consumed; count up to two more quotes.
			// Exactly one more means an empty string; otherwise n+1 quotes
			// (one or three) open the string.
			switch _, n := s.consumeQuotes(ch, 2); n {
			case 1:
				if ch == '"' {
					tok, lit = token.STRING, `""`
				} else {
					tok, lit = token.STRING, `''`
				}
			default:
				tok, lit = s.scanString(ch, n+1, n+1)
			}
		case '`':
			insertEOL = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = token.COLON
		case ';':
			tok = token.SEMICOLON
			insertEOL = true
		case '.':
			if '0' <= s.ch && s.ch <= '9' {
				insertEOL = true
				tok, lit = s.scanNumber(true)
			} else if s.ch == '.' {
				s.next()
				if s.ch == '.' {
					s.next()
					tok = token.ELLIPSIS
				} else {
					tok = token.RANGE
				}
			} else {
				tok = token.PERIOD
			}
		case ',':
			tok = token.COMMA
			lit = ","
		case '(':
			tok = token.LPAREN
		case ')':
			insertEOL = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertEOL = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertEOL = true
			tok = token.RBRACE
		case '+':
			tok = token.ADD // Consider ++ for list concatenate.
		case '-':
			if s.ch == '>' {
				s.next()
				tok = token.LAMBDA
			} else {
				tok = token.SUB
			}
		case '*':
			tok = token.MUL
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertEOL && s.findLineEnd() {
					// A comma is pending and the line ends within or after
					// the comment(s): emit the comma first and rescan the
					// comment on the next call.
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertEOL = false // newline consumed
					return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertEOL = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				tok = token.QUO
			}
		case '%':
			tok = token.REM
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch2(token.LSS, token.LEQ)
			}
		case '>':
			tok = s.switch2(token.GTR, token.GEQ)
		case '=':
			tok = s.switch2(token.BIND, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			switch s.ch {
			case '&':
				s.next()
				tok = token.LAND
			default:
				tok = token.UNIFY
			}
		case '|':
			if s.ch == '|' {
				s.next()
				tok = token.LOR
			} else {
				tok = token.DISJUNCTION
			}
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
			}
			insertEOL = s.insertEOL // preserve insertEOL info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	// remember whether a comma is pending, unless insertion is disabled
	if s.mode&dontInsertCommas == 0 {
		s.insertEOL = insertEOL
	}

	// a regular token was produced: restart the whitespace counters
	s.linesSinceLast = 0
	s.spacesSinceLast = 0
	return
}
+1189
cue/scanner/scanner_test.go
··· 1 + // Copyright 2018 The CUE Authors 2 + // 3 + // Licensed under the Apache License, Version 2.0 (the "License"); 4 + // you may not use this file except in compliance with the License. 5 + // You may obtain a copy of the License at 6 + // 7 + // http://www.apache.org/licenses/LICENSE-2.0 8 + // 9 + // Unless required by applicable law or agreed to in writing, software 10 + // distributed under the License is distributed on an "AS IS" BASIS, 11 + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 + // See the License for the specific language governing permissions and 13 + // limitations under the License. 14 + 15 + package scanner 16 + 17 + import ( 18 + "fmt" 19 + "io/ioutil" 20 + "os" 21 + "path/filepath" 22 + "reflect" 23 + "runtime" 24 + "strings" 25 + "testing" 26 + 27 + "cuelang.org/go/cue/errors" 28 + "cuelang.org/go/cue/token" 29 + "github.com/google/go-cmp/cmp" 30 + ) 31 + 32 + var fset = token.NewFileSet() 33 + 34 + const /* class */ ( 35 + special = iota 36 + literal 37 + operator 38 + keyword 39 + ) 40 + 41 + func tokenclass(tok token.Token) int { 42 + switch { 43 + case tok.IsLiteral(): 44 + return literal 45 + case tok.IsOperator(): 46 + return operator 47 + case tok.IsKeyword(): 48 + return keyword 49 + } 50 + return special 51 + } 52 + 53 + type elt struct { 54 + tok token.Token 55 + lit string 56 + class int 57 + } 58 + 59 + var testTokens = [...]elt{ 60 + // Special tokens 61 + {token.COMMENT, "/* a comment */", special}, 62 + {token.COMMENT, "// a comment \n", special}, 63 + {token.COMMENT, "/*\r*/", special}, 64 + {token.COMMENT, "//\r\n", special}, 65 + 66 + // Identifiers and basic type literals 67 + {token.BOTTOM, "_|_", literal}, 68 + {token.BOTTOM, "_|_", literal}, 69 + 70 + {token.IDENT, "foobar", literal}, 71 + {token.IDENT, "a۰۱۸", literal}, 72 + {token.IDENT, "foo६४", literal}, 73 + {token.IDENT, "bar9876", literal}, 74 + {token.IDENT, "ŝ", literal}, 75 + {token.IDENT, "ŝfoo", literal}, 76 + 
{token.INT, "0", literal}, 77 + {token.INT, "1", literal}, 78 + {token.INT, "123456789012345678890", literal}, 79 + {token.INT, "12345_67890_12345_6788_90", literal}, 80 + {token.INT, "1234567M", literal}, 81 + {token.INT, "1234567Mi", literal}, 82 + {token.INT, "01234567", literal}, 83 + {token.INT, ".3Mi", literal}, 84 + {token.INT, "3.3Mi", literal}, 85 + {token.INT, "0xcafebabe", literal}, 86 + {token.INT, "0b1100_1001", literal}, 87 + {token.FLOAT, "0.", literal}, 88 + {token.FLOAT, ".0", literal}, 89 + {token.FLOAT, "3.14159265", literal}, 90 + {token.FLOAT, "1e0", literal}, 91 + {token.FLOAT, "1e+100", literal}, 92 + {token.FLOAT, "1e-100", literal}, 93 + {token.FLOAT, "2.71828e-1000", literal}, 94 + {token.STRING, "`aa\n\n`", literal}, 95 + {token.STRING, "'a'", literal}, 96 + {token.STRING, "'\\000'", literal}, 97 + {token.STRING, "'\\xFF'", literal}, 98 + {token.STRING, "'\\uff16'", literal}, 99 + {token.STRING, "'\\U0000ff16'", literal}, 100 + {token.STRING, "'foobar'", literal}, 101 + {token.STRING, "`" + `foo 102 + bar` + 103 + "`", 104 + literal, 105 + }, 106 + {token.STRING, "`foobar`", literal}, 107 + {token.STRING, "`\r`", literal}, 108 + {token.STRING, "`foo\r\nbar`", literal}, 109 + {token.STRING, "'" + `\r` + "'", literal}, 110 + {token.STRING, "'foo" + `\r\n` + "bar'", literal}, 111 + {token.STRING, `"foobar"`, literal}, 112 + {token.STRING, `"""\n foobar\n """`, literal}, 113 + 114 + // Operators and delimiters 115 + {token.ADD, "+", operator}, 116 + {token.SUB, "-", operator}, 117 + {token.MUL, "*", operator}, 118 + {token.QUO, "/", operator}, 119 + {token.REM, "%", operator}, 120 + 121 + {token.UNIFY, "&", operator}, 122 + {token.DISJUNCTION, "|", operator}, 123 + 124 + {token.LAND, "&&", operator}, 125 + {token.LOR, "||", operator}, 126 + {token.LAMBDA, "->", operator}, 127 + 128 + {token.EQL, "==", operator}, 129 + {token.LSS, "<", operator}, 130 + {token.GTR, ">", operator}, 131 + {token.BIND, "=", operator}, 132 + {token.NOT, "!", 
operator}, 133 + 134 + {token.NEQ, "!=", operator}, 135 + {token.LEQ, "<=", operator}, 136 + {token.GEQ, ">=", operator}, 137 + {token.RANGE, "..", operator}, 138 + {token.ELLIPSIS, "...", operator}, 139 + 140 + {token.LPAREN, "(", operator}, 141 + {token.LBRACK, "[", operator}, 142 + {token.LBRACE, "{", operator}, 143 + {token.COMMA, ",", operator}, 144 + {token.PERIOD, ".", operator}, 145 + 146 + {token.RPAREN, ")", operator}, 147 + {token.RBRACK, "]", operator}, 148 + {token.RBRACE, "}", operator}, 149 + {token.COLON, ":", operator}, 150 + 151 + // Keywords 152 + {token.TRUE, "true", keyword}, 153 + {token.FALSE, "false", keyword}, 154 + {token.NULL, "null", keyword}, 155 + 156 + {token.FOR, "for", keyword}, 157 + {token.IF, "if", keyword}, 158 + {token.IN, "in", keyword}, 159 + } 160 + 161 + const whitespace = " \t \n\n\n" // to separate tokens 162 + 163 + var source = func() []byte { 164 + var src []byte 165 + for _, t := range testTokens { 166 + src = append(src, t.lit...) 167 + src = append(src, whitespace...) 
168 + } 169 + return src 170 + }() 171 + 172 + func newlineCount(s string) int { 173 + n := 0 174 + for i := 0; i < len(s); i++ { 175 + if s[i] == '\n' { 176 + n++ 177 + } 178 + } 179 + return n 180 + } 181 + 182 + func checkPosScan(t *testing.T, lit string, p token.Pos, expected token.Position) { 183 + pos := fset.Position(p) 184 + if pos.Filename != expected.Filename { 185 + t.Errorf("bad filename for %q: got %s, expected %s", lit, pos.Filename, expected.Filename) 186 + } 187 + if pos.Offset != expected.Offset { 188 + t.Errorf("bad position for %q: got %d, expected %d", lit, pos.Offset, expected.Offset) 189 + } 190 + if pos.Line != expected.Line { 191 + t.Errorf("bad line for %q: got %d, expected %d", lit, pos.Line, expected.Line) 192 + } 193 + if pos.Column != expected.Column { 194 + t.Errorf("bad column for %q: got %d, expected %d", lit, pos.Column, expected.Column) 195 + } 196 + } 197 + 198 + // Verify that calling Scan() provides the correct results. 199 + func TestScan(t *testing.T) { 200 + whitespace_linecount := newlineCount(whitespace) 201 + 202 + // error handler 203 + eh := func(_ token.Position, msg string) { 204 + t.Errorf("error handler called (msg = %s)", msg) 205 + } 206 + 207 + // verify scan 208 + var s Scanner 209 + s.Init(fset.AddFile("", fset.Base(), len(source)), source, eh, ScanComments|dontInsertCommas) 210 + 211 + // set up expected position 212 + epos := token.Position{ 213 + Filename: "", 214 + Offset: 0, 215 + Line: 1, 216 + Column: 1, 217 + } 218 + 219 + index := 0 220 + for { 221 + pos, tok, lit := s.Scan() 222 + 223 + // check position 224 + if tok == token.EOF { 225 + // correction for EOF 226 + epos.Line = newlineCount(string(source)) 227 + epos.Column = 2 228 + } 229 + checkPosScan(t, lit, pos, epos) 230 + 231 + // check token 232 + e := elt{token.EOF, "", special} 233 + if index < len(testTokens) { 234 + e = testTokens[index] 235 + index++ 236 + } 237 + if tok != e.tok { 238 + t.Errorf("bad token for %q: got %s, expected %s", 
lit, tok, e.tok) 239 + } 240 + 241 + // check token class 242 + if tokenclass(tok) != e.class { 243 + t.Errorf("bad class for %q: got %d, expected %d", lit, tokenclass(tok), e.class) 244 + } 245 + 246 + // check literal 247 + elit := "" 248 + switch e.tok { 249 + case token.COMMENT: 250 + // no CRs in comments 251 + elit = string(stripCR([]byte(e.lit))) 252 + //-style comment literal doesn't contain newline 253 + if elit[1] == '/' { 254 + elit = elit[0 : len(elit)-1] 255 + } 256 + case token.IDENT: 257 + elit = e.lit 258 + case token.COMMA: 259 + elit = "," 260 + default: 261 + if e.tok.IsLiteral() { 262 + // no CRs in raw string literals 263 + elit = e.lit 264 + if elit[0] == '`' { 265 + elit = string(stripCR([]byte(elit))) 266 + } 267 + } else if e.tok.IsKeyword() { 268 + elit = e.lit 269 + } 270 + } 271 + if lit != elit { 272 + t.Errorf("bad literal for %q: got %q, expected %q", lit, lit, elit) 273 + } 274 + 275 + if tok == token.EOF { 276 + break 277 + } 278 + 279 + // update position 280 + epos.Offset += len(e.lit) + len(whitespace) 281 + epos.Line += newlineCount(e.lit) + whitespace_linecount 282 + 283 + } 284 + 285 + if s.ErrorCount != 0 { 286 + t.Errorf("found %d errors", s.ErrorCount) 287 + } 288 + } 289 + 290 + func checkComma(t *testing.T, line string, mode Mode) { 291 + var S Scanner 292 + file := fset.AddFile("TestCommas", fset.Base(), len(line)) 293 + S.Init(file, []byte(line), nil, mode) 294 + pos, tok, lit := S.Scan() 295 + for tok != token.EOF { 296 + if tok == token.ILLEGAL { 297 + // the illegal token literal indicates what 298 + // kind of semicolon literal to expect 299 + commaLit := "\n" 300 + if lit[0] == '#' { 301 + commaLit = "," 302 + } 303 + // next token must be a comma 304 + commaPos := file.Position(pos) 305 + commaPos.Offset++ 306 + commaPos.Column++ 307 + pos, tok, lit = S.Scan() 308 + if tok == token.COMMA { 309 + if lit != commaLit { 310 + t.Errorf(`bad literal for %q: got %q (%q), expected %q`, line, lit, tok, commaLit) 311 + } 
312 + checkPosScan(t, line, pos, commaPos) 313 + } else { 314 + t.Errorf("bad token for %q: got %s, expected ','", line, tok) 315 + } 316 + } else if tok == token.COMMA { 317 + t.Errorf("bad token for %q: got ',', expected no ','", line) 318 + } 319 + pos, tok, lit = S.Scan() 320 + } 321 + } 322 + 323 + var lines = []string{ 324 + // # indicates a comma present in the source 325 + // ? indicates an automatically inserted comma 326 + "", 327 + "\ufeff#,", // first BOM is ignored 328 + "#,", 329 + "foo?\n", 330 + "_foo?\n", 331 + "123?\n", 332 + "1.2?\n", 333 + "'x'?\n", 334 + "_|_?\n", 335 + "_|_?\n", 336 + `"x"` + "?\n", 337 + "`x`?\n", 338 + `""" 339 + foo 340 + """` + "?\n", 341 + // `""" 342 + // foo \(bar) 343 + // """` + "?\n", 344 + `''' 345 + foo 346 + '''` + "?\n", 347 + 348 + "+\n", 349 + "-\n", 350 + "*\n", 351 + "/\n", 352 + "%\n", 353 + 354 + "&\n", 355 + // "&?\n", 356 + "|\n", 357 + 358 + "&&\n", 359 + "||\n", 360 + "<-\n", 361 + "->\n", 362 + 363 + "==\n", 364 + "<\n", 365 + ">\n", 366 + "=\n", 367 + "!\n", 368 + 369 + "!=\n", 370 + "<=\n", 371 + ">=\n", 372 + ":=\n", 373 + "...\n", 374 + 375 + "(\n", 376 + "[\n", 377 + "[[\n", 378 + "{\n", 379 + "{{\n", 380 + "#,\n", 381 + ".\n", 382 + 383 + ")?\n", 384 + "]?\n", 385 + "]]?\n", 386 + "}?\n", 387 + "}}?\n", 388 + ":\n", 389 + ";?\n", 390 + 391 + "true?\n", 392 + "false?\n", 393 + "null?\n", 394 + 395 + "foo?//comment\n", 396 + "foo?//comment", 397 + "foo?/*comment*/\n", 398 + "foo?/*\n*/", 399 + "foo?/*comment*/ \n", 400 + "foo?/*\n*/ ", 401 + 402 + "foo ?// comment\n", 403 + "foo ?// comment", 404 + "foo ?/*comment*/\n", 405 + "foo ?/*\n*/", 406 + "foo ?/* */ /* \n */ bar?/**/\n", 407 + "foo ?/*0*/ /*1*/ /*2*/\n", 408 + 409 + "foo ?/*comment*/ \n", 410 + "foo ?/*0*/ /*1*/ /*2*/ \n", 411 + "foo ?/**/ /*-------------*/ /*----\n*/bar ?/* \n*/baa?\n", 412 + "foo ?/* an EOF terminates a line */", 413 + "foo ?/* an EOF terminates a line */ /*", 414 + "foo ?/* an EOF terminates a line */ //", 415 + 416 + 
// "package main?\n\nfunc main() {\n\tif {\n\t\treturn /* */ }?\n}?\n", 417 + // "package main?", 418 + } 419 + 420 + func TestCommas(t *testing.T) { 421 + for _, line := range lines { 422 + checkComma(t, line, 0) 423 + checkComma(t, line, ScanComments) 424 + 425 + // if the input ended in newlines, the input must tokenize the 426 + // same with or without those newlines 427 + for i := len(line) - 1; i >= 0 && line[i] == '\n'; i-- { 428 + checkComma(t, line[0:i], 0) 429 + checkComma(t, line[0:i], ScanComments) 430 + } 431 + } 432 + } 433 + 434 + func TestRelative(t *testing.T) { 435 + test := ` 436 + package foo 437 + 438 + // comment 439 + a: /* a */1 440 + b : 5 /* 441 + line one 442 + line two 443 + */ 444 + c: "dfs" 445 + ` 446 + want := []string{ 447 + `newline IDENT package`, 448 + `blank IDENT foo`, 449 + "elided , \n", 450 + `section COMMENT // comment`, 451 + `newline IDENT a`, 452 + `nospace : `, 453 + `blank COMMENT /* a */`, 454 + `nospace INT 1`, 455 + "elided , \n", 456 + `newline IDENT b`, 457 + `blank : `, 458 + `blank INT 5`, 459 + "elided , \n", 460 + "blank COMMENT /*\n\t line one\n\t line two\n\t*/", 461 + `newline IDENT c`, 462 + `nospace : `, 463 + `blank STRING "dfs"`, 464 + "elided , \n", 465 + } 466 + var S Scanner 467 + f := fset.AddFile("TestCommas", fset.Base(), len(test)) 468 + S.Init(f, []byte(test), nil, ScanComments) 469 + pos, tok, lit := S.Scan() 470 + got := []string{} 471 + for tok != token.EOF { 472 + got = append(got, fmt.Sprintf("%-7s %-8s %s", pos.RelPos(), tok, lit)) 473 + pos, tok, lit = S.Scan() 474 + } 475 + if !cmp.Equal(got, want) { 476 + t.Error(cmp.Diff(got, want)) 477 + } 478 + } 479 + 480 + type segment struct { 481 + srcline string // a line of source text 482 + filename string // filename for current token 483 + line int // line number for current token 484 + } 485 + 486 + var segments = []segment{ 487 + // exactly one token per line since the test consumes one token per segment 488 + {" line1", 
filepath.Join("dir", "TestLineComments"), 1}, 489 + {"\nline2", filepath.Join("dir", "TestLineComments"), 2}, 490 + {"\nline3 //line File1.go:100", filepath.Join("dir", "TestLineComments"), 3}, // bad line comment, ignored 491 + {"\nline4", filepath.Join("dir", "TestLineComments"), 4}, 492 + {"\n//line File1.go:100\n line100", filepath.Join("dir", "File1.go"), 100}, 493 + {"\n//line \t :42\n line1", "", 42}, 494 + {"\n//line File2.go:200\n line200", filepath.Join("dir", "File2.go"), 200}, 495 + {"\n//line foo\t:42\n line42", filepath.Join("dir", "foo"), 42}, 496 + {"\n //line foo:42\n line44", filepath.Join("dir", "foo"), 44}, // bad line comment, ignored 497 + {"\n//line foo 42\n line46", filepath.Join("dir", "foo"), 46}, // bad line comment, ignored 498 + {"\n//line foo:42 extra text\n line48", filepath.Join("dir", "foo"), 48}, // bad line comment, ignored 499 + {"\n//line ./foo:42\n line42", filepath.Join("dir", "foo"), 42}, 500 + {"\n//line a/b/c/File1.go:100\n line100", filepath.Join("dir", "a", "b", "c", "File1.go"), 100}, 501 + } 502 + 503 + var unixsegments = []segment{ 504 + {"\n//line /bar:42\n line42", "/bar", 42}, 505 + } 506 + 507 + var winsegments = []segment{ 508 + {"\n//line c:\\bar:42\n line42", "c:\\bar", 42}, 509 + {"\n//line c:\\dir\\File1.go:100\n line100", "c:\\dir\\File1.go", 100}, 510 + } 511 + 512 + // Verify that comments of the form "//line filename:line" are interpreted correctly. 513 + func TestLineComments(t *testing.T) { 514 + segs := segments 515 + if runtime.GOOS == "windows" { 516 + segs = append(segs, winsegments...) 517 + } else { 518 + segs = append(segs, unixsegments...) 
519 + } 520 + 521 + // make source 522 + var src string 523 + for _, e := range segs { 524 + src += e.srcline 525 + } 526 + 527 + // verify scan 528 + var S Scanner 529 + f := fset.AddFile(filepath.Join("dir", "TestLineComments"), fset.Base(), len(src)) 530 + S.Init(f, []byte(src), nil, dontInsertCommas) 531 + for _, s := range segs { 532 + p, _, lit := S.Scan() 533 + pos := f.Position(p) 534 + checkPosScan(t, lit, p, token.Position{ 535 + Filename: s.filename, 536 + Offset: pos.Offset, 537 + Line: s.line, 538 + Column: pos.Column, 539 + }) 540 + } 541 + 542 + if S.ErrorCount != 0 { 543 + t.Errorf("found %d errors", S.ErrorCount) 544 + } 545 + } 546 + 547 + // Verify that initializing the same scanner more than once works correctly. 548 + func TestInit(t *testing.T) { 549 + var s Scanner 550 + 551 + // 1st init 552 + src1 := "false true { }" 553 + f1 := fset.AddFile("src1", fset.Base(), len(src1)) 554 + s.Init(f1, []byte(src1), nil, dontInsertCommas) 555 + if f1.Size() != len(src1) { 556 + t.Errorf("bad file size: got %d, expected %d", f1.Size(), len(src1)) 557 + } 558 + s.Scan() // false 559 + s.Scan() // true 560 + _, tok, _ := s.Scan() // { 561 + if tok != token.LBRACE { 562 + t.Errorf("bad token: got %s, expected %s", tok, token.LBRACE) 563 + } 564 + 565 + // 2nd init 566 + src2 := "null true { ]" 567 + f2 := fset.AddFile("src2", fset.Base(), len(src2)) 568 + s.Init(f2, []byte(src2), nil, dontInsertCommas) 569 + if f2.Size() != len(src2) { 570 + t.Errorf("bad file size: got %d, expected %d", f2.Size(), len(src2)) 571 + } 572 + _, tok, _ = s.Scan() // go 573 + if tok != token.NULL { 574 + t.Errorf("bad token: got %s, expected %s", tok, token.NULL) 575 + } 576 + 577 + if s.ErrorCount != 0 { 578 + t.Errorf("found %d errors", s.ErrorCount) 579 + } 580 + } 581 + 582 + func TestScanTemplate(t *testing.T) { 583 + // error handler 584 + eh := func(pos token.Position, msg string) { 585 + t.Errorf("error handler called (pos = %v, msg = %s)", pos, msg) 586 + } 587 + trim 
:= func(s string) string { return strings.Trim(s, `"\\()`) } 588 + 589 + sources := []string{ 590 + `"first\(first)\\second\(second)"`, 591 + `"level\( ["foo", "level", level ][2] )end\( end )"`, 592 + `"level\( { "foo": 1, "bar": level } )end\(end)"`, 593 + } 594 + for i, src := range sources { 595 + name := fmt.Sprintf("tsrc%d", i) 596 + t.Run(name, func(t *testing.T) { 597 + f := fset.AddFile(name, fset.Base(), len(src)) 598 + 599 + // verify scan 600 + var s Scanner 601 + s.Init(f, []byte(src), eh, ScanComments) 602 + 603 + count := 0 604 + var lit, str string 605 + for tok := token.ILLEGAL; tok != token.EOF; { 606 + switch tok { 607 + case token.LPAREN: 608 + count++ 609 + case token.RPAREN: 610 + if count--; count == 0 { 611 + str = trim(s.ResumeInterpolation('"', 1)) 612 + } 613 + case token.INTERPOLATION: 614 + str = trim(lit) 615 + case token.IDENT: 616 + if lit != str { 617 + t.Errorf("str: got %v; want %v", lit, str) 618 + } 619 + } 620 + _, tok, lit = s.Scan() 621 + } 622 + }) 623 + } 624 + } 625 + 626 + func TestStdErrorHander(t *testing.T) { 627 + const src = "#\n" + // illegal character, cause an error 628 + "# #\n" + // two errors on the same line 629 + "//line File2:20\n" + 630 + "#\n" + // different file, but same line 631 + "//line File2:1\n" + 632 + "# #\n" + // same file, decreasing line number 633 + "//line File1:1\n" + 634 + "# # #" // original file, line 1 again 635 + 636 + var list errors.List 637 + eh := func(pos token.Position, msg string) { list.AddNew(pos, msg) } 638 + 639 + var s Scanner 640 + s.Init(fset.AddFile("File1", fset.Base(), len(src)), []byte(src), eh, dontInsertCommas) 641 + for { 642 + if _, tok, _ := s.Scan(); tok == token.EOF { 643 + break 644 + } 645 + } 646 + 647 + if len(list) != s.ErrorCount { 648 + t.Errorf("found %d errors, expected %d", len(list), s.ErrorCount) 649 + } 650 + 651 + if len(list) != 9 { 652 + t.Errorf("found %d raw errors, expected 9", len(list)) 653 + errors.Print(os.Stderr, list) 654 + } 655 + 656 + 
list.Sort() 657 + if len(list) != 9 { 658 + t.Errorf("found %d sorted errors, expected 9", len(list)) 659 + errors.Print(os.Stderr, list) 660 + } 661 + 662 + list.RemoveMultiples() 663 + if len(list) != 4 { 664 + t.Errorf("found %d one-per-line errors, expected 4", len(list)) 665 + errors.Print(os.Stderr, list) 666 + } 667 + } 668 + 669 + type errorCollector struct { 670 + cnt int // number of errors encountered 671 + msg string // last error message encountered 672 + pos token.Position // last error position encountered 673 + } 674 + 675 + func checkError(t *testing.T, src string, tok token.Token, pos int, lit, err string) { 676 + t.Helper() 677 + var s Scanner 678 + var h errorCollector 679 + eh := func(pos token.Position, msg string) { 680 + h.cnt++ 681 + h.msg = msg 682 + h.pos = pos 683 + } 684 + s.Init(fset.AddFile("", fset.Base(), len(src)), []byte(src), eh, ScanComments|dontInsertCommas) 685 + _, tok0, lit0 := s.Scan() 686 + if tok0 != tok { 687 + t.Errorf("%q: got %s, expected %s", src, tok0, tok) 688 + } 689 + if tok0 != token.ILLEGAL && lit0 != lit { 690 + t.Errorf("%q: got literal %q, expected %q", src, lit0, lit) 691 + } 692 + cnt := 0 693 + if err != "" { 694 + cnt = 1 695 + } 696 + if h.cnt != cnt { 697 + t.Errorf("%q: got cnt %d, expected %d", src, h.cnt, cnt) 698 + } 699 + if h.msg != err { 700 + t.Errorf("%q: got msg %q, expected %q", src, h.msg, err) 701 + } 702 + if h.pos.Offset != pos { 703 + t.Errorf("%q: got offset %d, expected %d", src, h.pos.Offset, pos) 704 + } 705 + } 706 + 707 + var errorTests = []struct { 708 + src string 709 + tok token.Token 710 + pos int 711 + lit string 712 + err string 713 + }{ 714 + {"\a", token.ILLEGAL, 0, "", "illegal character U+0007"}, 715 + {`?`, token.ILLEGAL, 0, "", "illegal character U+003F '?'"}, 716 + {`…`, token.ILLEGAL, 0, "", "illegal character U+2026 '…'"}, 717 + {`_|`, token.ILLEGAL, 0, "", "illegal token '_|'; expected '_'"}, 718 + // {`' '`, STRING, 0, `' '`, ""}, 719 + // {"`\0`", STRING, 3, 
`'\0'`, "illegal character U+0027 ''' in escape sequence"}, 720 + // {`'\07'`, STRING, 4, `'\07'`, "illegal character U+0027 ''' in escape sequence"}, 721 + {`"\8"`, token.STRING, 2, `"\8"`, "unknown escape sequence"}, 722 + {`"\08"`, token.STRING, 3, `"\08"`, "illegal character U+0038 '8' in escape sequence"}, 723 + {`"\x"`, token.STRING, 3, `"\x"`, "illegal character U+0022 '\"' in escape sequence"}, 724 + {`"\x0"`, token.STRING, 4, `"\x0"`, "illegal character U+0022 '\"' in escape sequence"}, 725 + {`"\x0g"`, token.STRING, 4, `"\x0g"`, "illegal character U+0067 'g' in escape sequence"}, 726 + {`"\u"`, token.STRING, 3, `"\u"`, "illegal character U+0022 '\"' in escape sequence"}, 727 + {`"\u0"`, token.STRING, 4, `"\u0"`, "illegal character U+0022 '\"' in escape sequence"}, 728 + {`"\u00"`, token.STRING, 5, `"\u00"`, "illegal character U+0022 '\"' in escape sequence"}, 729 + {`"\u000"`, token.STRING, 6, `"\u000"`, "illegal character U+0022 '\"' in escape sequence"}, 730 + // {`"\u000`, token.STRING, 6, `"\u000`, "string literal not terminated"}, two errors 731 + {`"\u0000"`, token.STRING, 0, `"\u0000"`, ""}, 732 + {`"\U"`, token.STRING, 3, `"\U"`, "illegal character U+0022 '\"' in escape sequence"}, 733 + {`"\U0"`, token.STRING, 4, `"\U0"`, "illegal character U+0022 '\"' in escape sequence"}, 734 + {`"\U00"`, token.STRING, 5, `"\U00"`, "illegal character U+0022 '\"' in escape sequence"}, 735 + {`"\U000"`, token.STRING, 6, `"\U000"`, "illegal character U+0022 '\"' in escape sequence"}, 736 + {`"\U0000"`, token.STRING, 7, `"\U0000"`, "illegal character U+0022 '\"' in escape sequence"}, 737 + {`"\U00000"`, token.STRING, 8, `"\U00000"`, "illegal character U+0022 '\"' in escape sequence"}, 738 + {`"\U000000"`, token.STRING, 9, `"\U000000"`, "illegal character U+0022 '\"' in escape sequence"}, 739 + {`"\U0000000"`, token.STRING, 10, `"\U0000000"`, "illegal character U+0022 '\"' in escape sequence"}, 740 + // {`"\U0000000`, token.STRING, 10, `"\U0000000`, "string literal 
not terminated"}, // escape sequence not terminated"}, two errors 741 + {`"\U00000000"`, token.STRING, 0, `"\U00000000"`, ""}, 742 + {`"\Uffffffff"`, token.STRING, 2, `"\Uffffffff"`, "escape sequence is invalid Unicode code point"}, 743 + {`'`, token.STRING, 0, `'`, "string literal not terminated"}, 744 + // TODO 745 + // {`'\`, token.STRING, 0, `'\`, "raw string literal not terminated"}, // "escape sequence not terminated"}, 746 + // {"`\n", token.STRING, 0, s"`\n", "raw string literal not terminated"}, 747 + // {"'\n ", token.STRING, 0, "'", "raw string literal not terminated"}, 748 + {`""`, token.STRING, 0, `""`, ""}, 749 + {`"abc`, token.STRING, 0, `"abc`, "string literal not terminated"}, 750 + {`""abc`, token.STRING, 0, `""`, ""}, 751 + {`"""abc`, token.STRING, 0, `"""abc`, "string literal not terminated"}, 752 + {`'''abc`, token.STRING, 0, `'''abc`, "string literal not terminated"}, 753 + {"\"abc\n", token.STRING, 0, `"abc`, "string literal not terminated"}, 754 + {"\"abc\n ", token.STRING, 0, `"abc`, "string literal not terminated"}, 755 + {"``", token.STRING, 0, "``", ""}, 756 + // {"$", IDENT, 0, "$", ""}, // TODO: for root of file? 
757 + {"`", token.STRING, 0, "`", "raw string literal not terminated"}, 758 + {"''", token.STRING, 0, "''", ""}, 759 + {"'", token.STRING, 0, "'", "string literal not terminated"}, 760 + {"/**/", token.COMMENT, 0, "/**/", ""}, 761 + {"/*", token.COMMENT, 0, "/*", "comment not terminated"}, 762 + {"077", token.INT, 0, "077", ""}, 763 + {"078.", token.FLOAT, 0, "078.", ""}, 764 + {"07801234567.", token.FLOAT, 0, "07801234567.", ""}, 765 + {"078e0", token.FLOAT, 0, "078e0", ""}, 766 + {"078", token.INT, 0, "078", "illegal octal number"}, 767 + {"07800000009", token.INT, 0, "07800000009", "illegal octal number"}, 768 + {"0x", token.INT, 0, "0x", "illegal hexadecimal number"}, 769 + {"0X", token.INT, 0, "0X", "illegal hexadecimal number"}, 770 + {"0Xbeef_", token.INT, 6, "0Xbeef_", "illegal '_' in number"}, 771 + {"0b", token.INT, 0, "0b", "illegal binary number"}, 772 + {"0B", token.INT, 0, "0B", "illegal binary number"}, 773 + // {"123456789012345678890_i", IMAG, 21, "123456789012345678890_i", "illegal '_' in number"}, 774 + {"\"abc\x00def\"", token.STRING, 4, "\"abc\x00def\"", "illegal character NUL"}, 775 + {"\"abc\x80def\"", token.STRING, 4, "\"abc\x80def\"", "illegal UTF-8 encoding"}, 776 + {"\ufeff\ufeff", token.ILLEGAL, 3, "\ufeff\ufeff", "illegal byte order mark"}, // only first BOM is ignored 777 + {"//\ufeff", token.COMMENT, 2, "//\ufeff", "illegal byte order mark"}, // only first BOM is ignored 778 + // {"`a\ufeff`", IDENT, 2, "`a\ufeff`", "illegal byte order mark"}, // only first BOM is ignored 779 + {`"` + "abc\ufeffdef" + `"`, token.STRING, 4, `"` + "abc\ufeffdef" + `"`, "illegal byte order mark"}, // only first BOM is ignored 780 + } 781 + 782 + func TestScanErrors(t *testing.T) { 783 + for _, e := range errorTests { 784 + t.Run(e.src, func(t *testing.T) { 785 + checkError(t, e.src, e.tok, e.pos, e.lit, e.err) 786 + }) 787 + } 788 + } 789 + 790 + // Verify that no comments show up as literal values when skipping comments. 
791 + func TestNoLiteralComments(t *testing.T) { 792 + var src = ` 793 + a: { 794 + A: 1 // foo 795 + } 796 + 797 + b: { 798 + B: 2 799 + // foo 800 + } 801 + 802 + c: 3 // foo 803 + 804 + d: 4 805 + // foo 806 + 807 + b anycode(): { 808 + // foo 809 + } 810 + ` 811 + var s Scanner 812 + s.Init(fset.AddFile("", fset.Base(), len(src)), []byte(src), nil, 0) 813 + for { 814 + pos, tok, lit := s.Scan() 815 + class := tokenclass(tok) 816 + if lit != "" && class != keyword && class != literal && tok != token.COMMA { 817 + t.Errorf("%s: tok = %s, lit = %q", fset.Position(pos), tok, lit) 818 + } 819 + if tok <= token.EOF { 820 + break 821 + } 822 + } 823 + } 824 + 825 + func BenchmarkScan(b *testing.B) { 826 + b.StopTimer() 827 + fset := token.NewFileSet() 828 + file := fset.AddFile("", fset.Base(), len(source)) 829 + var s Scanner 830 + b.StartTimer() 831 + for i := 0; i < b.N; i++ { 832 + s.Init(file, source, nil, ScanComments) 833 + for { 834 + _, tok, _ := s.Scan() 835 + if tok == token.EOF { 836 + break 837 + } 838 + } 839 + } 840 + } 841 + 842 + func BenchmarkScanFile(b *testing.B) { 843 + b.StopTimer() 844 + const filename = "go" 845 + src, err := ioutil.ReadFile(filename) 846 + if err != nil { 847 + panic(err) 848 + } 849 + fset := token.NewFileSet() 850 + file := fset.AddFile(filename, fset.Base(), len(src)) 851 + b.SetBytes(int64(len(src))) 852 + var s Scanner 853 + b.StartTimer() 854 + for i := 0; i < b.N; i++ { 855 + s.Init(file, src, nil, ScanComments) 856 + for { 857 + _, tok, _ := s.Scan() 858 + if tok == token.EOF { 859 + break 860 + } 861 + } 862 + } 863 + } 864 + 865 + func TestScanner_next(t *testing.T) { 866 + tests := []struct { 867 + name string 868 + s *Scanner 869 + }{ 870 + // TODO: Add test cases. 
871 + } 872 + for _, tt := range tests { 873 + tt.s.next() 874 + } 875 + } 876 + 877 + func TestScanner_Init(t *testing.T) { 878 + type args struct { 879 + file *token.File 880 + src []byte 881 + err errors.Handler 882 + mode Mode 883 + } 884 + tests := []struct { 885 + name string 886 + s *Scanner 887 + args args 888 + }{ 889 + // TODO: Add test cases. 890 + } 891 + for _, tt := range tests { 892 + tt.s.Init(tt.args.file, tt.args.src, tt.args.err, tt.args.mode) 893 + } 894 + } 895 + 896 + func TestScanner_error(t *testing.T) { 897 + type args struct { 898 + offs int 899 + msg string 900 + } 901 + tests := []struct { 902 + name string 903 + s *Scanner 904 + args args 905 + }{ 906 + // TODO: Add test cases. 907 + } 908 + for _, tt := range tests { 909 + tt.s.error(tt.args.offs, tt.args.msg) 910 + } 911 + } 912 + 913 + func TestScanner_interpretLineComment(t *testing.T) { 914 + type args struct { 915 + text []byte 916 + } 917 + tests := []struct { 918 + name string 919 + s *Scanner 920 + args args 921 + }{ 922 + // TODO: Add test cases. 923 + } 924 + for _, tt := range tests { 925 + tt.s.interpretLineComment(tt.args.text) 926 + } 927 + } 928 + 929 + func TestScanner_scanComment(t *testing.T) { 930 + tests := []struct { 931 + name string 932 + s *Scanner 933 + want string 934 + }{ 935 + // TODO: Add test cases. 936 + } 937 + for _, tt := range tests { 938 + if got := tt.s.scanComment(); got != tt.want { 939 + t.Errorf("%q. Scanner.scanComment() = %v, want %v", tt.name, got, tt.want) 940 + } 941 + } 942 + } 943 + 944 + func TestScanner_findLineEnd(t *testing.T) { 945 + tests := []struct { 946 + name string 947 + s *Scanner 948 + want bool 949 + }{ 950 + // TODO: Add test cases. 951 + } 952 + for _, tt := range tests { 953 + if got := tt.s.findLineEnd(); got != tt.want { 954 + t.Errorf("%q. 
Scanner.findLineEnd() = %v, want %v", tt.name, got, tt.want) 955 + } 956 + } 957 + } 958 + 959 + func Test_isLetter(t *testing.T) { 960 + type args struct { 961 + ch rune 962 + } 963 + tests := []struct { 964 + name string 965 + args args 966 + want bool 967 + }{ 968 + // TODO: Add test cases. 969 + } 970 + for _, tt := range tests { 971 + if got := isLetter(tt.args.ch); got != tt.want { 972 + t.Errorf("%q. isLetter() = %v, want %v", tt.name, got, tt.want) 973 + } 974 + } 975 + } 976 + 977 + func Test_isDigit(t *testing.T) { 978 + type args struct { 979 + ch rune 980 + } 981 + tests := []struct { 982 + name string 983 + args args 984 + want bool 985 + }{ 986 + // TODO: Add test cases. 987 + } 988 + for _, tt := range tests { 989 + if got := isDigit(tt.args.ch); got != tt.want { 990 + t.Errorf("%q. isDigit() = %v, want %v", tt.name, got, tt.want) 991 + } 992 + } 993 + } 994 + 995 + func TestScanner_scanIdentifier(t *testing.T) { 996 + tests := []struct { 997 + name string 998 + s *Scanner 999 + want string 1000 + }{ 1001 + // TODO: Add test cases. 1002 + } 1003 + for _, tt := range tests { 1004 + if got := tt.s.scanIdentifier(); got != tt.want { 1005 + t.Errorf("%q. Scanner.scanIdentifier() = %v, want %v", tt.name, got, tt.want) 1006 + } 1007 + } 1008 + } 1009 + 1010 + func Test_digitVal(t *testing.T) { 1011 + type args struct { 1012 + ch rune 1013 + } 1014 + tests := []struct { 1015 + name string 1016 + args args 1017 + want int 1018 + }{ 1019 + // TODO: Add test cases. 1020 + } 1021 + for _, tt := range tests { 1022 + if got := digitVal(tt.args.ch); got != tt.want { 1023 + t.Errorf("%q. digitVal() = %v, want %v", tt.name, got, tt.want) 1024 + } 1025 + } 1026 + } 1027 + 1028 + func TestScanner_scanMantissa(t *testing.T) { 1029 + type args struct { 1030 + base int 1031 + } 1032 + tests := []struct { 1033 + name string 1034 + s *Scanner 1035 + args args 1036 + }{ 1037 + // TODO: Add test cases. 
1038 + } 1039 + for _, tt := range tests { 1040 + tt.s.scanMantissa(tt.args.base) 1041 + } 1042 + } 1043 + 1044 + func TestScanner_scanNumber(t *testing.T) { 1045 + type args struct { 1046 + seenDecimalPoint bool 1047 + } 1048 + tests := []struct { 1049 + name string 1050 + s *Scanner 1051 + args args 1052 + want token.Token 1053 + want1 string 1054 + }{ 1055 + // TODO: Add test cases. 1056 + } 1057 + for _, tt := range tests { 1058 + got, got1 := tt.s.scanNumber(tt.args.seenDecimalPoint) 1059 + if !reflect.DeepEqual(got, tt.want) { 1060 + t.Errorf("%q. Scanner.scanNumber() got = %v, want %v", tt.name, got, tt.want) 1061 + } 1062 + if got1 != tt.want1 { 1063 + t.Errorf("%q. Scanner.scanNumber() got1 = %v, want %v", tt.name, got1, tt.want1) 1064 + } 1065 + } 1066 + } 1067 + 1068 + func TestScanner_scanEscape(t *testing.T) { 1069 + type args struct { 1070 + quote rune 1071 + } 1072 + tests := []struct { 1073 + name string 1074 + s *Scanner 1075 + args args 1076 + want bool 1077 + }{ 1078 + // TODO: Add test cases. 1079 + } 1080 + for _, tt := range tests { 1081 + if got, _ := tt.s.scanEscape(tt.args.quote); got != tt.want { 1082 + t.Errorf("%q. Scanner.scanEscape() = %v, want %v", tt.name, got, tt.want) 1083 + } 1084 + } 1085 + } 1086 + 1087 + func TestScanner_scanString(t *testing.T) { 1088 + tests := []struct { 1089 + name string 1090 + s *Scanner 1091 + want string 1092 + }{ 1093 + // TODO: Add test cases. 1094 + } 1095 + for _, tt := range tests { 1096 + if _, got := tt.s.scanString(rune(tt.name[0]), 1, 1); got != tt.want { 1097 + t.Errorf("%q. Scanner.scanString() = %v, want %v", tt.name, got, tt.want) 1098 + } 1099 + } 1100 + } 1101 + 1102 + func Test_stripCR(t *testing.T) { 1103 + type args struct { 1104 + b []byte 1105 + } 1106 + tests := []struct { 1107 + name string 1108 + args args 1109 + want []byte 1110 + }{ 1111 + // TODO: Add test cases. 
1112 + } 1113 + for _, tt := range tests { 1114 + if got := stripCR(tt.args.b); !reflect.DeepEqual(got, tt.want) { 1115 + t.Errorf("%q. stripCR() = %v, want %v", tt.name, got, tt.want) 1116 + } 1117 + } 1118 + } 1119 + 1120 + func TestScanner_scanRawString(t *testing.T) { 1121 + tests := []struct { 1122 + name string 1123 + s *Scanner 1124 + want string 1125 + }{ 1126 + // TODO: Add test cases. 1127 + } 1128 + for _, tt := range tests { 1129 + if got := tt.s.scanRawString(); got != tt.want { 1130 + t.Errorf("%q. Scanner.scanRawString() = %v, want %v", tt.name, got, tt.want) 1131 + } 1132 + } 1133 + } 1134 + 1135 + func TestScanner_skipWhitespace(t *testing.T) { 1136 + tests := []struct { 1137 + name string 1138 + s *Scanner 1139 + }{ 1140 + // TODO: Add test cases. 1141 + } 1142 + for _, tt := range tests { 1143 + tt.s.skipWhitespace(1) 1144 + } 1145 + } 1146 + 1147 + func TestScanner_switch2(t *testing.T) { 1148 + type args struct { 1149 + tok0 token.Token 1150 + tok1 token.Token 1151 + } 1152 + tests := []struct { 1153 + name string 1154 + s *Scanner 1155 + args args 1156 + want token.Token 1157 + }{ 1158 + // TODO: Add test cases. 1159 + } 1160 + for _, tt := range tests { 1161 + if got := tt.s.switch2(tt.args.tok0, tt.args.tok1); !reflect.DeepEqual(got, tt.want) { 1162 + t.Errorf("%q. Scanner.switch2() = %v, want %v", tt.name, got, tt.want) 1163 + } 1164 + } 1165 + } 1166 + 1167 + func TestScanner_Scan(t *testing.T) { 1168 + tests := []struct { 1169 + name string 1170 + s *Scanner 1171 + wantPos token.Pos 1172 + wantTok token.Token 1173 + wantLit string 1174 + }{ 1175 + // TODO: Add test cases. 1176 + } 1177 + for _, tt := range tests { 1178 + gotPos, gotTok, gotLit := tt.s.Scan() 1179 + if !reflect.DeepEqual(gotPos, tt.wantPos) { 1180 + t.Errorf("%q. Scanner.Scan() gotPos = %v, want %v", tt.name, gotPos, tt.wantPos) 1181 + } 1182 + if !reflect.DeepEqual(gotTok, tt.wantTok) { 1183 + t.Errorf("%q. 
Scanner.Scan() gotTok = %v, want %v", tt.name, gotTok, tt.wantTok) 1184 + } 1185 + if gotLit != tt.wantLit { 1186 + t.Errorf("%q. Scanner.Scan() gotLit = %v, want %v", tt.name, gotLit, tt.wantLit) 1187 + } 1188 + } 1189 + }