Source file src/go/scanner/scanner.go
1
2
3
4
5
6
7
8
9 package scanner
10
11 import (
12 "bytes"
13 "fmt"
14 "go/token"
15 "path/filepath"
16 "strconv"
17 "unicode"
18 "unicode/utf8"
19 )
20
21
22
23
24
25
// An ErrorHandler is a callback that receives the position and the message
// of an error found by the scanner. Scanner.error invokes the handler given
// to Init (when it is non-nil) once per reported error.
26 type ErrorHandler func(pos token.Position, msg string)
27
28
29
30
31
// A Scanner holds the scanner's internal state while it processes a source
// text. Init fills in every field; the scanning methods read s.file and
// s.src, so Init is expected to run before Scan.
32 type Scanner struct {
33
// State set once by Init and not changed while scanning.
34 file *token.File // file handle; receives line starts via AddLine
35 dir string // directory portion of file.Name()
36 src []byte // source bytes being scanned
37 err ErrorHandler // error callback; may be nil
38 mode Mode // scanning mode flags
39
40
// Scanning state, advanced by next.
41 ch rune // current character; -1 once the end of src is reached
42 offset int // byte offset of ch in src
43 rdOffset int // byte offset of the character following ch
44 lineOffset int // byte offset at which the current line starts
45 insertSemi bool // when set, a newline or EOF is returned as a SEMICOLON token
46
47
// ErrorCount is incremented by one for each error reported through error.
48 ErrorCount int
49 }
50
// bom is the Unicode byte order mark. Init skips it when it is the very
// first character; next reports it as illegal at any other offset.
51 const bom = 0xFEFF
52
53
54
55
56 func (s *Scanner) next() {
57 if s.rdOffset < len(s.src) {
58 s.offset = s.rdOffset
59 if s.ch == '\n' {
60 s.lineOffset = s.offset
61 s.file.AddLine(s.offset)
62 }
63 r, w := rune(s.src[s.rdOffset]), 1
64 switch {
65 case r == 0:
66 s.error(s.offset, "illegal character NUL")
67 case r >= utf8.RuneSelf:
68
69 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
70 if r == utf8.RuneError && w == 1 {
71 s.error(s.offset, "illegal UTF-8 encoding")
72 } else if r == bom && s.offset > 0 {
73 s.error(s.offset, "illegal byte order mark")
74 }
75 }
76 s.rdOffset += w
77 s.ch = r
78 } else {
79 s.offset = len(s.src)
80 if s.ch == '\n' {
81 s.lineOffset = s.offset
82 s.file.AddLine(s.offset)
83 }
84 s.ch = -1
85 }
86 }
87
88
89
90 func (s *Scanner) peek() byte {
91 if s.rdOffset < len(s.src) {
92 return s.src[s.rdOffset]
93 }
94 return 0
95 }
96
97
98
99
// A Mode value is a set of flags (or 0) that controls scanner behavior.
// Init stores it and Scan consults it.
100 type Mode uint
101
102 const (
// ScanComments makes Scan return comments as COMMENT tokens; when the flag
// is clear, Scan skips comments and continues with the next token.
103 ScanComments Mode = 1 << iota
// dontInsertSemis stops Scan from updating s.insertSemi after a token, so
// no automatic semicolons are generated.
104 dontInsertSemis
105 )
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
123
124 if file.Size() != len(src) {
125 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
126 }
127 s.file = file
128 s.dir, _ = filepath.Split(file.Name())
129 s.src = src
130 s.err = err
131 s.mode = mode
132
133 s.ch = ' '
134 s.offset = 0
135 s.rdOffset = 0
136 s.lineOffset = 0
137 s.insertSemi = false
138 s.ErrorCount = 0
139
140 s.next()
141 if s.ch == bom {
142 s.next()
143 }
144 }
145
146 func (s *Scanner) error(offs int, msg string) {
147 if s.err != nil {
148 s.err(s.file.Position(s.file.Pos(offs)), msg)
149 }
150 s.ErrorCount++
151 }
152
153 func (s *Scanner) errorf(offs int, format string, args ...interface{}) {
154 s.error(offs, fmt.Sprintf(format, args...))
155 }
156
157 func (s *Scanner) scanComment() string {
158
159 offs := s.offset - 1
160 next := -1
161 numCR := 0
162
163 if s.ch == '/' {
164
165
166 s.next()
167 for s.ch != '\n' && s.ch >= 0 {
168 if s.ch == '\r' {
169 numCR++
170 }
171 s.next()
172 }
173
174 next = s.offset
175 if s.ch == '\n' {
176 next++
177 }
178 goto exit
179 }
180
181
182 s.next()
183 for s.ch >= 0 {
184 ch := s.ch
185 if ch == '\r' {
186 numCR++
187 }
188 s.next()
189 if ch == '*' && s.ch == '/' {
190 s.next()
191 next = s.offset
192 goto exit
193 }
194 }
195
196 s.error(offs, "comment not terminated")
197
198 exit:
199 lit := s.src[offs:s.offset]
200
201
202
203
204
205
206 if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
207 lit = lit[:len(lit)-1]
208 numCR--
209 }
210
211
212
213 if next >= 0 && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
214 s.updateLineInfo(next, offs, lit)
215 }
216
217 if numCR > 0 {
218 lit = stripCR(lit, lit[1] == '*')
219 }
220
221 return string(lit)
222 }
223
// prefix is the text that must directly follow the "//" or "/*" of a
// comment for scanComment to treat it as a line directive.
224 var prefix = []byte("line ")
225
226
227
228
229 func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
230
231 if text[1] == '*' {
232 text = text[:len(text)-2]
233 }
234 text = text[7:]
235 offs += 7
236
237 i, n, ok := trailingDigits(text)
238 if i == 0 {
239 return
240 }
241
242
243 if !ok {
244
245 s.error(offs+i, "invalid line number: "+string(text[i:]))
246 return
247 }
248
249 var line, col int
250 i2, n2, ok2 := trailingDigits(text[:i-1])
251 if ok2 {
252
253 i, i2 = i2, i
254 line, col = n2, n
255 if col == 0 {
256 s.error(offs+i2, "invalid column number: "+string(text[i2:]))
257 return
258 }
259 text = text[:i2-1]
260 } else {
261
262 line = n
263 }
264
265 if line == 0 {
266 s.error(offs+i, "invalid line number: "+string(text[i:]))
267 return
268 }
269
270
271
272 filename := string(text[:i-1])
273 if filename == "" && ok2 {
274 filename = s.file.Position(s.file.Pos(offs)).Filename
275 } else if filename != "" {
276
277
278
279 filename = filepath.Clean(filename)
280 if !filepath.IsAbs(filename) {
281 filename = filepath.Join(s.dir, filename)
282 }
283 }
284
285 s.file.AddLineColumnInfo(next, filename, line, col)
286 }
287
// trailingDigits looks for the last ':' in text and parses what follows it
// as an unsigned decimal number.
//
// It returns the index just past that ':' (0 if text contains no ':'), the
// parsed value, and whether parsing succeeded. Parsing fails for empty or
// non-numeric suffixes and for values that do not fit in a non-negative
// int; the returned number is meaningful only when the result is true.
func trailingDigits(text []byte) (int, int, bool) {
	i := bytes.LastIndexByte(text, ':') // search from the right: the filename part may itself contain ':'
	if i < 0 {
		return 0, 0, false // no ':'
	}
	// i >= 0

	// Parse with one bit less than the width of int so every accepted
	// value converts to int without wrapping to a negative number.
	n, err := strconv.ParseUint(string(text[i+1:]), 10, strconv.IntSize-1)
	return i + 1, int(n), err == nil
}
297
298 func (s *Scanner) findLineEnd() bool {
299
300
301 defer func(offs int) {
302
303 s.ch = '/'
304 s.offset = offs
305 s.rdOffset = offs + 1
306 s.next()
307 }(s.offset - 1)
308
309
310 for s.ch == '/' || s.ch == '*' {
311 if s.ch == '/' {
312
313 return true
314 }
315
316 s.next()
317 for s.ch >= 0 {
318 ch := s.ch
319 if ch == '\n' {
320 return true
321 }
322 s.next()
323 if ch == '*' && s.ch == '/' {
324 s.next()
325 break
326 }
327 }
328 s.skipWhitespace()
329 if s.ch < 0 || s.ch == '\n' {
330 return true
331 }
332 if s.ch != '/' {
333
334 return false
335 }
336 s.next()
337 }
338
339 return false
340 }
341
342 func isLetter(ch rune) bool {
343 return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
344 }
345
346 func isDigit(ch rune) bool {
347 return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
348 }
349
350 func (s *Scanner) scanIdentifier() string {
351 offs := s.offset
352 for isLetter(s.ch) || isDigit(s.ch) {
353 s.next()
354 }
355 return string(s.src[offs:s.offset])
356 }
357
358 func digitVal(ch rune) int {
359 switch {
360 case '0' <= ch && ch <= '9':
361 return int(ch - '0')
362 case 'a' <= lower(ch) && lower(ch) <= 'f':
363 return int(lower(ch) - 'a' + 10)
364 }
365 return 16
366 }
367
// lower returns ch with the ASCII case bit set. For an ASCII letter this is
// its lower-case form; other characters are changed only if that bit was
// clear.
func lower(ch rune) rune {
	const caseBit = 'a' - 'A'
	return ch | caseBit
}

// isDecimal reports whether ch is an ASCII decimal digit.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}

// isHex reports whether ch is an ASCII hexadecimal digit of either case.
func isHex(ch rune) bool {
	if isDecimal(ch) {
		return true
	}
	lc := lower(ch)
	return lc >= 'a' && lc <= 'f'
}
371
372
373
374
375
376
377
378 func (s *Scanner) digits(base int, invalid *int) (digsep int) {
379 if base <= 10 {
380 max := rune('0' + base)
381 for isDecimal(s.ch) || s.ch == '_' {
382 ds := 1
383 if s.ch == '_' {
384 ds = 2
385 } else if s.ch >= max && *invalid < 0 {
386 *invalid = int(s.offset)
387 }
388 digsep |= ds
389 s.next()
390 }
391 } else {
392 for isHex(s.ch) || s.ch == '_' {
393 ds := 1
394 if s.ch == '_' {
395 ds = 2
396 }
397 digsep |= ds
398 s.next()
399 }
400 }
401 return
402 }
403
404 func (s *Scanner) scanNumber() (token.Token, string) {
405 offs := s.offset
406 tok := token.ILLEGAL
407
408 base := 10
409 prefix := rune(0)
410 digsep := 0
411 invalid := -1
412
413
414 if s.ch != '.' {
415 tok = token.INT
416 if s.ch == '0' {
417 s.next()
418 switch lower(s.ch) {
419 case 'x':
420 s.next()
421 base, prefix = 16, 'x'
422 case 'o':
423 s.next()
424 base, prefix = 8, 'o'
425 case 'b':
426 s.next()
427 base, prefix = 2, 'b'
428 default:
429 base, prefix = 8, '0'
430 digsep = 1
431 }
432 }
433 digsep |= s.digits(base, &invalid)
434 }
435
436
437 if s.ch == '.' {
438 tok = token.FLOAT
439 if prefix == 'o' || prefix == 'b' {
440 s.error(s.offset, "invalid radix point in "+litname(prefix))
441 }
442 s.next()
443 digsep |= s.digits(base, &invalid)
444 }
445
446 if digsep&1 == 0 {
447 s.error(s.offset, litname(prefix)+" has no digits")
448 }
449
450
451 if e := lower(s.ch); e == 'e' || e == 'p' {
452 switch {
453 case e == 'e' && prefix != 0 && prefix != '0':
454 s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
455 case e == 'p' && prefix != 'x':
456 s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
457 }
458 s.next()
459 tok = token.FLOAT
460 if s.ch == '+' || s.ch == '-' {
461 s.next()
462 }
463 ds := s.digits(10, nil)
464 digsep |= ds
465 if ds&1 == 0 {
466 s.error(s.offset, "exponent has no digits")
467 }
468 } else if prefix == 'x' && tok == token.FLOAT {
469 s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
470 }
471
472
473 if s.ch == 'i' {
474 tok = token.IMAG
475 s.next()
476 }
477
478 lit := string(s.src[offs:s.offset])
479 if tok == token.INT && invalid >= 0 {
480 s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
481 }
482 if digsep&2 != 0 {
483 if i := invalidSep(lit); i >= 0 {
484 s.error(offs+i, "'_' must separate successive digits")
485 }
486 }
487
488 return tok, lit
489 }
490
// litname returns the descriptive name, used in error messages, of the
// kind of literal introduced by prefix: 'x', 'o' or '0', or 'b'. Any other
// prefix denotes a decimal literal.
func litname(prefix rune) string {
	name := "decimal literal"
	switch prefix {
	case 'x':
		name = "hexadecimal literal"
	case 'o', '0':
		name = "octal literal"
	case 'b':
		name = "binary literal"
	}
	return name
}
502
503
504 func invalidSep(x string) int {
505 x1 := ' '
506 d := '.'
507 i := 0
508
509
510 if len(x) >= 2 && x[0] == '0' {
511 x1 = lower(rune(x[1]))
512 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
513 d = '0'
514 i = 2
515 }
516 }
517
518
519 for ; i < len(x); i++ {
520 p := d
521 d = rune(x[i])
522 switch {
523 case d == '_':
524 if p != '0' {
525 return i
526 }
527 case isDecimal(d) || x1 == 'x' && isHex(d):
528 d = '0'
529 default:
530 if p == '_' {
531 return i - 1
532 }
533 d = '.'
534 }
535 }
536 if d == '_' {
537 return len(x) - 1
538 }
539
540 return -1
541 }
542
543
544
545
546
547 func (s *Scanner) scanEscape(quote rune) bool {
548 offs := s.offset
549
550 var n int
551 var base, max uint32
552 switch s.ch {
553 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
554 s.next()
555 return true
556 case '0', '1', '2', '3', '4', '5', '6', '7':
557 n, base, max = 3, 8, 255
558 case 'x':
559 s.next()
560 n, base, max = 2, 16, 255
561 case 'u':
562 s.next()
563 n, base, max = 4, 16, unicode.MaxRune
564 case 'U':
565 s.next()
566 n, base, max = 8, 16, unicode.MaxRune
567 default:
568 msg := "unknown escape sequence"
569 if s.ch < 0 {
570 msg = "escape sequence not terminated"
571 }
572 s.error(offs, msg)
573 return false
574 }
575
576 var x uint32
577 for n > 0 {
578 d := uint32(digitVal(s.ch))
579 if d >= base {
580 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
581 if s.ch < 0 {
582 msg = "escape sequence not terminated"
583 }
584 s.error(s.offset, msg)
585 return false
586 }
587 x = x*base + d
588 s.next()
589 n--
590 }
591
592 if x > max || 0xD800 <= x && x < 0xE000 {
593 s.error(offs, "escape sequence is invalid Unicode code point")
594 return false
595 }
596
597 return true
598 }
599
600 func (s *Scanner) scanRune() string {
601
602 offs := s.offset - 1
603
604 valid := true
605 n := 0
606 for {
607 ch := s.ch
608 if ch == '\n' || ch < 0 {
609
610 if valid {
611 s.error(offs, "rune literal not terminated")
612 valid = false
613 }
614 break
615 }
616 s.next()
617 if ch == '\'' {
618 break
619 }
620 n++
621 if ch == '\\' {
622 if !s.scanEscape('\'') {
623 valid = false
624 }
625
626 }
627 }
628
629 if valid && n != 1 {
630 s.error(offs, "illegal rune literal")
631 }
632
633 return string(s.src[offs:s.offset])
634 }
635
636 func (s *Scanner) scanString() string {
637
638 offs := s.offset - 1
639
640 for {
641 ch := s.ch
642 if ch == '\n' || ch < 0 {
643 s.error(offs, "string literal not terminated")
644 break
645 }
646 s.next()
647 if ch == '"' {
648 break
649 }
650 if ch == '\\' {
651 s.scanEscape('"')
652 }
653 }
654
655 return string(s.src[offs:s.offset])
656 }
657
// stripCR returns a copy of b with carriage returns removed. When comment
// is true, a '\r' that sits between a '*' and a '/' inside the comment body
// is kept, so that removing it cannot create a "*/" that ends the comment
// early.
func stripCR(b []byte, comment bool) []byte {
	out := make([]byte, 0, len(b))
	for j, ch := range b {
		if ch == '\r' {
			// The length check keeps the '*' of the opening "/*" from
			// counting as the left neighbour.
			keep := comment &&
				len(out) > len("/*") && out[len(out)-1] == '*' &&
				j+1 < len(b) && b[j+1] == '/'
			if !keep {
				continue
			}
		}
		out = append(out, ch)
	}
	return out
}
674
675 func (s *Scanner) scanRawString() string {
676
677 offs := s.offset - 1
678
679 hasCR := false
680 for {
681 ch := s.ch
682 if ch < 0 {
683 s.error(offs, "raw string literal not terminated")
684 break
685 }
686 s.next()
687 if ch == '`' {
688 break
689 }
690 if ch == '\r' {
691 hasCR = true
692 }
693 }
694
695 lit := s.src[offs:s.offset]
696 if hasCR {
697 lit = stripCR(lit, false)
698 }
699
700 return string(lit)
701 }
702
703 func (s *Scanner) skipWhitespace() {
704 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
705 s.next()
706 }
707 }
708
709
710
711
712
713
714
715 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
716 if s.ch == '=' {
717 s.next()
718 return tok1
719 }
720 return tok0
721 }
722
723 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
724 if s.ch == '=' {
725 s.next()
726 return tok1
727 }
728 if s.ch == ch2 {
729 s.next()
730 return tok2
731 }
732 return tok0
733 }
734
735 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
736 if s.ch == '=' {
737 s.next()
738 return tok1
739 }
740 if s.ch == ch2 {
741 s.next()
742 if s.ch == '=' {
743 s.next()
744 return tok3
745 }
746 return tok2
747 }
748 return tok0
749 }
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
783 scanAgain:
784 s.skipWhitespace()
785
786
787 pos = s.file.Pos(s.offset)
788
789
790 insertSemi := false
791 switch ch := s.ch; {
792 case isLetter(ch):
793 lit = s.scanIdentifier()
794 if len(lit) > 1 {
795
796 tok = token.Lookup(lit)
797 switch tok {
798 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
799 insertSemi = true
800 }
801 } else {
802 insertSemi = true
803 tok = token.IDENT
804 }
805 case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
806 insertSemi = true
807 tok, lit = s.scanNumber()
808 default:
809 s.next()
810 switch ch {
811 case -1:
812 if s.insertSemi {
813 s.insertSemi = false
814 return pos, token.SEMICOLON, "\n"
815 }
816 tok = token.EOF
817 case '\n':
818
819
820
821 s.insertSemi = false
822 return pos, token.SEMICOLON, "\n"
823 case '"':
824 insertSemi = true
825 tok = token.STRING
826 lit = s.scanString()
827 case '\'':
828 insertSemi = true
829 tok = token.CHAR
830 lit = s.scanRune()
831 case '`':
832 insertSemi = true
833 tok = token.STRING
834 lit = s.scanRawString()
835 case ':':
836 tok = s.switch2(token.COLON, token.DEFINE)
837 case '.':
838
839 tok = token.PERIOD
840 if s.ch == '.' && s.peek() == '.' {
841 s.next()
842 s.next()
843 tok = token.ELLIPSIS
844 }
845 case ',':
846 tok = token.COMMA
847 case ';':
848 tok = token.SEMICOLON
849 lit = ";"
850 case '(':
851 tok = token.LPAREN
852 case ')':
853 insertSemi = true
854 tok = token.RPAREN
855 case '[':
856 tok = token.LBRACK
857 case ']':
858 insertSemi = true
859 tok = token.RBRACK
860 case '{':
861 tok = token.LBRACE
862 case '}':
863 insertSemi = true
864 tok = token.RBRACE
865 case '+':
866 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
867 if tok == token.INC {
868 insertSemi = true
869 }
870 case '-':
871 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
872 if tok == token.DEC {
873 insertSemi = true
874 }
875 case '*':
876 tok = s.switch2(token.MUL, token.MUL_ASSIGN)
877 case '/':
878 if s.ch == '/' || s.ch == '*' {
879
880 if s.insertSemi && s.findLineEnd() {
881
882 s.ch = '/'
883 s.offset = s.file.Offset(pos)
884 s.rdOffset = s.offset + 1
885 s.insertSemi = false
886 return pos, token.SEMICOLON, "\n"
887 }
888 comment := s.scanComment()
889 if s.mode&ScanComments == 0 {
890
891 s.insertSemi = false
892 goto scanAgain
893 }
894 tok = token.COMMENT
895 lit = comment
896 } else {
897 tok = s.switch2(token.QUO, token.QUO_ASSIGN)
898 }
899 case '%':
900 tok = s.switch2(token.REM, token.REM_ASSIGN)
901 case '^':
902 tok = s.switch2(token.XOR, token.XOR_ASSIGN)
903 case '<':
904 if s.ch == '-' {
905 s.next()
906 tok = token.ARROW
907 } else {
908 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
909 }
910 case '>':
911 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
912 case '=':
913 tok = s.switch2(token.ASSIGN, token.EQL)
914 case '!':
915 tok = s.switch2(token.NOT, token.NEQ)
916 case '&':
917 if s.ch == '^' {
918 s.next()
919 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
920 } else {
921 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
922 }
923 case '|':
924 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
925 default:
926
927 if ch != bom {
928 s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
929 }
930 insertSemi = s.insertSemi
931 tok = token.ILLEGAL
932 lit = string(ch)
933 }
934 }
935 if s.mode&dontInsertSemis == 0 {
936 s.insertSemi = insertSemi
937 }
938
939 return
940 }
941
View as plain text