Source file src/pkg/cmd/compile/internal/syntax/scanner.go
1
2
3
4
5
6
7
8
9
10
11
12
13 package syntax
14
15 import (
16 "fmt"
17 "io"
18 "unicode"
19 "unicode/utf8"
20 )
21
22
23
24
// Bit flags stored in scanner.mode. Each flag has its own bit so the two
// can be combined with |.
const (
	comments   uint = 1 << 0 // when set, lineComment and fullComment report every comment
	directives uint = 1 << 1 // when set (and comments is not), only "go:"/"line " comments are reported
)
29
30 type scanner struct {
31 source
32 mode uint
33 nlsemi bool
34
35
36 line, col uint
37 tok token
38 lit string
39 kind LitKind
40 op Operator
41 prec int
42 }
43
44 func (s *scanner) init(src io.Reader, errh func(line, col uint, msg string), mode uint) {
45 s.source.init(src, errh)
46 s.mode = mode
47 s.nlsemi = false
48 }
49
50 func (s *scanner) errorf(format string, args ...interface{}) {
51 s.error(fmt.Sprintf(format, args...))
52 }
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74 func (s *scanner) next() {
75 nlsemi := s.nlsemi
76 s.nlsemi = false
77
78 redo:
79
80 c := s.getr()
81 for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
82 c = s.getr()
83 }
84
85
86 s.line, s.col = s.source.line0, s.source.col0
87
88 if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) {
89 s.ident()
90 return
91 }
92
93 switch c {
94 case -1:
95 if nlsemi {
96 s.lit = "EOF"
97 s.tok = _Semi
98 break
99 }
100 s.tok = _EOF
101
102 case '\n':
103 s.lit = "newline"
104 s.tok = _Semi
105
106 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
107 s.number(c)
108
109 case '"':
110 s.stdString()
111
112 case '`':
113 s.rawString()
114
115 case '\'':
116 s.rune()
117
118 case '(':
119 s.tok = _Lparen
120
121 case '[':
122 s.tok = _Lbrack
123
124 case '{':
125 s.tok = _Lbrace
126
127 case ',':
128 s.tok = _Comma
129
130 case ';':
131 s.lit = "semicolon"
132 s.tok = _Semi
133
134 case ')':
135 s.nlsemi = true
136 s.tok = _Rparen
137
138 case ']':
139 s.nlsemi = true
140 s.tok = _Rbrack
141
142 case '}':
143 s.nlsemi = true
144 s.tok = _Rbrace
145
146 case ':':
147 if s.getr() == '=' {
148 s.tok = _Define
149 break
150 }
151 s.ungetr()
152 s.tok = _Colon
153
154 case '.':
155 c = s.getr()
156 if isDecimal(c) {
157 s.ungetr()
158 s.unread(1)
159 s.number('.')
160 break
161 }
162 if c == '.' {
163 c = s.getr()
164 if c == '.' {
165 s.tok = _DotDotDot
166 break
167 }
168 s.unread(1)
169 }
170 s.ungetr()
171 s.tok = _Dot
172
173 case '+':
174 s.op, s.prec = Add, precAdd
175 c = s.getr()
176 if c != '+' {
177 goto assignop
178 }
179 s.nlsemi = true
180 s.tok = _IncOp
181
182 case '-':
183 s.op, s.prec = Sub, precAdd
184 c = s.getr()
185 if c != '-' {
186 goto assignop
187 }
188 s.nlsemi = true
189 s.tok = _IncOp
190
191 case '*':
192 s.op, s.prec = Mul, precMul
193
194 if s.getr() == '=' {
195 s.tok = _AssignOp
196 break
197 }
198 s.ungetr()
199 s.tok = _Star
200
201 case '/':
202 c = s.getr()
203 if c == '/' {
204 s.lineComment()
205 goto redo
206 }
207 if c == '*' {
208 s.fullComment()
209 if s.source.line > s.line && nlsemi {
210
211
212 s.lit = "newline"
213 s.tok = _Semi
214 break
215 }
216 goto redo
217 }
218 s.op, s.prec = Div, precMul
219 goto assignop
220
221 case '%':
222 s.op, s.prec = Rem, precMul
223 c = s.getr()
224 goto assignop
225
226 case '&':
227 c = s.getr()
228 if c == '&' {
229 s.op, s.prec = AndAnd, precAndAnd
230 s.tok = _Operator
231 break
232 }
233 s.op, s.prec = And, precMul
234 if c == '^' {
235 s.op = AndNot
236 c = s.getr()
237 }
238 goto assignop
239
240 case '|':
241 c = s.getr()
242 if c == '|' {
243 s.op, s.prec = OrOr, precOrOr
244 s.tok = _Operator
245 break
246 }
247 s.op, s.prec = Or, precAdd
248 goto assignop
249
250 case '^':
251 s.op, s.prec = Xor, precAdd
252 c = s.getr()
253 goto assignop
254
255 case '<':
256 c = s.getr()
257 if c == '=' {
258 s.op, s.prec = Leq, precCmp
259 s.tok = _Operator
260 break
261 }
262 if c == '<' {
263 s.op, s.prec = Shl, precMul
264 c = s.getr()
265 goto assignop
266 }
267 if c == '-' {
268 s.tok = _Arrow
269 break
270 }
271 s.ungetr()
272 s.op, s.prec = Lss, precCmp
273 s.tok = _Operator
274
275 case '>':
276 c = s.getr()
277 if c == '=' {
278 s.op, s.prec = Geq, precCmp
279 s.tok = _Operator
280 break
281 }
282 if c == '>' {
283 s.op, s.prec = Shr, precMul
284 c = s.getr()
285 goto assignop
286 }
287 s.ungetr()
288 s.op, s.prec = Gtr, precCmp
289 s.tok = _Operator
290
291 case '=':
292 if s.getr() == '=' {
293 s.op, s.prec = Eql, precCmp
294 s.tok = _Operator
295 break
296 }
297 s.ungetr()
298 s.tok = _Assign
299
300 case '!':
301 if s.getr() == '=' {
302 s.op, s.prec = Neq, precCmp
303 s.tok = _Operator
304 break
305 }
306 s.ungetr()
307 s.op, s.prec = Not, 0
308 s.tok = _Operator
309
310 default:
311 s.tok = 0
312 s.errorf("invalid character %#U", c)
313 goto redo
314 }
315
316 return
317
318 assignop:
319 if c == '=' {
320 s.tok = _AssignOp
321 return
322 }
323 s.ungetr()
324 s.tok = _Operator
325 }
326
327 func isLetter(c rune) bool {
328 return 'a' <= lower(c) && lower(c) <= 'z' || c == '_'
329 }
330
331 func (s *scanner) ident() {
332 s.startLit()
333
334
335 c := s.getr()
336 for isLetter(c) || isDecimal(c) {
337 c = s.getr()
338 }
339
340
341 if c >= utf8.RuneSelf {
342 for s.isIdentRune(c, false) {
343 c = s.getr()
344 }
345 }
346 s.ungetr()
347
348 lit := s.stopLit()
349
350
351 if len(lit) >= 2 {
352 if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) {
353 s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
354 s.tok = tok
355 return
356 }
357 }
358
359 s.nlsemi = true
360 s.lit = string(lit)
361 s.tok = _Name
362 }
363
364
365
366 func tokStrFast(tok token) string {
367 return _token_name[_token_index[tok-1]:_token_index[tok]]
368 }
369
370 func (s *scanner) isIdentRune(c rune, first bool) bool {
371 switch {
372 case unicode.IsLetter(c) || c == '_':
373
374 case unicode.IsDigit(c):
375 if first {
376 s.errorf("identifier cannot begin with digit %#U", c)
377 }
378 case c >= utf8.RuneSelf:
379 s.errorf("invalid identifier character %#U", c)
380 default:
381 return false
382 }
383 return true
384 }
385
386
387
388 func hash(s []byte) uint {
389 return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
390 }
391
392 var keywordMap [1 << 6]token
393
394 func init() {
395
396 for tok := _Break; tok <= _Var; tok++ {
397 h := hash([]byte(tok.String()))
398 if keywordMap[h] != 0 {
399 panic("imperfect hash")
400 }
401 keywordMap[h] = tok
402 }
403 }
404
// lower returns c with the bit 'a'-'A' set. For ASCII letters this yields
// the lower-case letter; other runes may be changed to unrelated values.
func lower(c rune) rune { return c | ('a' - 'A') }

// isDecimal reports whether c is one of the ASCII digits '0' through '9'.
func isDecimal(c rune) bool { return c >= '0' && c <= '9' }

// isHex reports whether c is an ASCII hexadecimal digit of either case.
func isHex(c rune) bool {
	if isDecimal(c) {
		return true
	}
	l := lower(c)
	return 'a' <= l && l <= 'f'
}
408
409
410
411
412
413
414
415
416 func (s *scanner) digits(c0 rune, base int, invalid *int) (c rune, digsep int) {
417 c = c0
418 if base <= 10 {
419 max := rune('0' + base)
420 for isDecimal(c) || c == '_' {
421 ds := 1
422 if c == '_' {
423 ds = 2
424 } else if c >= max && *invalid < 0 {
425 *invalid = int(s.col0 - s.col)
426 }
427 digsep |= ds
428 c = s.getr()
429 }
430 } else {
431 for isHex(c) || c == '_' {
432 ds := 1
433 if c == '_' {
434 ds = 2
435 }
436 digsep |= ds
437 c = s.getr()
438 }
439 }
440 return
441 }
442
443 func (s *scanner) number(c rune) {
444 s.startLit()
445
446 base := 10
447 prefix := rune(0)
448 digsep := 0
449 invalid := -1
450
451
452 var ds int
453 if c != '.' {
454 s.kind = IntLit
455 if c == '0' {
456 c = s.getr()
457 switch lower(c) {
458 case 'x':
459 c = s.getr()
460 base, prefix = 16, 'x'
461 case 'o':
462 c = s.getr()
463 base, prefix = 8, 'o'
464 case 'b':
465 c = s.getr()
466 base, prefix = 2, 'b'
467 default:
468 base, prefix = 8, '0'
469 digsep = 1
470 }
471 }
472 c, ds = s.digits(c, base, &invalid)
473 digsep |= ds
474 }
475
476
477 if c == '.' {
478 s.kind = FloatLit
479 if prefix == 'o' || prefix == 'b' {
480 s.error("invalid radix point in " + litname(prefix))
481 }
482 c, ds = s.digits(s.getr(), base, &invalid)
483 digsep |= ds
484 }
485
486 if digsep&1 == 0 {
487 s.error(litname(prefix) + " has no digits")
488 }
489
490
491 if e := lower(c); e == 'e' || e == 'p' {
492 switch {
493 case e == 'e' && prefix != 0 && prefix != '0':
494 s.errorf("%q exponent requires decimal mantissa", c)
495 case e == 'p' && prefix != 'x':
496 s.errorf("%q exponent requires hexadecimal mantissa", c)
497 }
498 c = s.getr()
499 s.kind = FloatLit
500 if c == '+' || c == '-' {
501 c = s.getr()
502 }
503 c, ds = s.digits(c, 10, nil)
504 digsep |= ds
505 if ds&1 == 0 {
506 s.error("exponent has no digits")
507 }
508 } else if prefix == 'x' && s.kind == FloatLit {
509 s.error("hexadecimal mantissa requires a 'p' exponent")
510 }
511
512
513 if c == 'i' {
514 s.kind = ImagLit
515 c = s.getr()
516 }
517 s.ungetr()
518
519 s.nlsemi = true
520 s.lit = string(s.stopLit())
521 s.tok = _Literal
522
523 if s.kind == IntLit && invalid >= 0 {
524 s.errh(s.line, s.col+uint(invalid), fmt.Sprintf("invalid digit %q in %s", s.lit[invalid], litname(prefix)))
525 }
526
527 if digsep&2 != 0 {
528 if i := invalidSep(s.lit); i >= 0 {
529 s.errh(s.line, s.col+uint(i), "'_' must separate successive digits")
530 }
531 }
532 }
533
// litname returns the name used in error messages for a literal with the
// given prefix: 'x' is hexadecimal, 'o' and '0' are octal, 'b' is binary,
// and any other value is decimal.
func litname(prefix rune) string {
	if prefix == 'x' {
		return "hexadecimal literal"
	}
	if prefix == 'o' || prefix == '0' {
		return "octal literal"
	}
	if prefix == 'b' {
		return "binary literal"
	}
	return "decimal literal"
}
545
546
547 func invalidSep(x string) int {
548 x1 := ' '
549 d := '.'
550 i := 0
551
552
553 if len(x) >= 2 && x[0] == '0' {
554 x1 = lower(rune(x[1]))
555 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
556 d = '0'
557 i = 2
558 }
559 }
560
561
562 for ; i < len(x); i++ {
563 p := d
564 d = rune(x[i])
565 switch {
566 case d == '_':
567 if p != '0' {
568 return i
569 }
570 case isDecimal(d) || x1 == 'x' && isHex(d):
571 d = '0'
572 default:
573 if p == '_' {
574 return i - 1
575 }
576 d = '.'
577 }
578 }
579 if d == '_' {
580 return len(x) - 1
581 }
582
583 return -1
584 }
585
586 func (s *scanner) rune() {
587 s.startLit()
588
589 ok := true
590 n := 0
591 for ; ; n++ {
592 r := s.getr()
593 if r == '\'' {
594 break
595 }
596 if r == '\\' {
597 if !s.escape('\'') {
598 ok = false
599 }
600 continue
601 }
602 if r == '\n' {
603 s.ungetr()
604 if ok {
605 s.error("newline in character literal")
606 ok = false
607 }
608 break
609 }
610 if r < 0 {
611 if ok {
612 s.errh(s.line, s.col, "invalid character literal (missing closing ')")
613 ok = false
614 }
615 break
616 }
617 }
618
619 if ok {
620 if n == 0 {
621 s.error("empty character literal or unescaped ' in character literal")
622 } else if n != 1 {
623 s.errh(s.line, s.col, "invalid character literal (more than one character)")
624 }
625 }
626
627 s.nlsemi = true
628 s.lit = string(s.stopLit())
629 s.kind = RuneLit
630 s.tok = _Literal
631 }
632
633 func (s *scanner) stdString() {
634 s.startLit()
635
636 for {
637 r := s.getr()
638 if r == '"' {
639 break
640 }
641 if r == '\\' {
642 s.escape('"')
643 continue
644 }
645 if r == '\n' {
646 s.ungetr()
647 s.error("newline in string")
648 break
649 }
650 if r < 0 {
651 s.errh(s.line, s.col, "string not terminated")
652 break
653 }
654 }
655
656 s.nlsemi = true
657 s.lit = string(s.stopLit())
658 s.kind = StringLit
659 s.tok = _Literal
660 }
661
662 func (s *scanner) rawString() {
663 s.startLit()
664
665 for {
666 r := s.getr()
667 if r == '`' {
668 break
669 }
670 if r < 0 {
671 s.errh(s.line, s.col, "string not terminated")
672 break
673 }
674 }
675
676
677
678
679 s.nlsemi = true
680 s.lit = string(s.stopLit())
681 s.kind = StringLit
682 s.tok = _Literal
683 }
684
685 func (s *scanner) comment(text string) {
686 s.errh(s.line, s.col, text)
687 }
688
689 func (s *scanner) skipLine(r rune) {
690 for r >= 0 {
691 if r == '\n' {
692 s.ungetr()
693 break
694 }
695 r = s.getr()
696 }
697 }
698
699 func (s *scanner) lineComment() {
700 r := s.getr()
701
702 if s.mode&comments != 0 {
703 s.startLit()
704 s.skipLine(r)
705 s.comment("//" + string(s.stopLit()))
706 return
707 }
708
709
710 if s.mode&directives == 0 || s.col != colbase || (r != 'g' && r != 'l') {
711 s.skipLine(r)
712 return
713 }
714
715
716 prefix := "go:"
717 if r == 'l' {
718 prefix = "line "
719 }
720 for _, m := range prefix {
721 if r != m {
722 s.skipLine(r)
723 return
724 }
725 r = s.getr()
726 }
727
728
729 s.startLit()
730 s.skipLine(r)
731 s.comment("//" + prefix + string(s.stopLit()))
732 }
733
734 func (s *scanner) skipComment(r rune) bool {
735 for r >= 0 {
736 for r == '*' {
737 r = s.getr()
738 if r == '/' {
739 return true
740 }
741 }
742 r = s.getr()
743 }
744 s.errh(s.line, s.col, "comment not terminated")
745 return false
746 }
747
748 func (s *scanner) fullComment() {
749 r := s.getr()
750
751 if s.mode&comments != 0 {
752 s.startLit()
753 if s.skipComment(r) {
754 s.comment("/*" + string(s.stopLit()))
755 } else {
756 s.killLit()
757 }
758 return
759 }
760
761 if s.mode&directives == 0 || r != 'l' {
762 s.skipComment(r)
763 return
764 }
765
766
767 const prefix = "line "
768 for _, m := range prefix {
769 if r != m {
770 s.skipComment(r)
771 return
772 }
773 r = s.getr()
774 }
775
776
777 s.startLit()
778 if s.skipComment(r) {
779 s.comment("/*" + prefix + string(s.stopLit()))
780 } else {
781 s.killLit()
782 }
783 }
784
785 func (s *scanner) escape(quote rune) bool {
786 var n int
787 var base, max uint32
788
789 c := s.getr()
790 switch c {
791 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
792 return true
793 case '0', '1', '2', '3', '4', '5', '6', '7':
794 n, base, max = 3, 8, 255
795 case 'x':
796 c = s.getr()
797 n, base, max = 2, 16, 255
798 case 'u':
799 c = s.getr()
800 n, base, max = 4, 16, unicode.MaxRune
801 case 'U':
802 c = s.getr()
803 n, base, max = 8, 16, unicode.MaxRune
804 default:
805 if c < 0 {
806 return true
807 }
808 s.error("unknown escape sequence")
809 return false
810 }
811
812 var x uint32
813 for i := n; i > 0; i-- {
814 d := base
815 switch {
816 case isDecimal(c):
817 d = uint32(c) - '0'
818 case 'a' <= lower(c) && lower(c) <= 'f':
819 d = uint32(lower(c)) - ('a' - 10)
820 }
821 if d >= base {
822 if c < 0 {
823 return true
824 }
825 kind := "hex"
826 if base == 8 {
827 kind = "octal"
828 }
829 s.errorf("non-%s character in escape sequence: %c", kind, c)
830 s.ungetr()
831 return false
832 }
833
834 x = x*base + d
835 c = s.getr()
836 }
837 s.ungetr()
838
839 if x > max && base == 8 {
840 s.errorf("octal escape value > 255: %d", x)
841 return false
842 }
843
844 if x > max || 0xD800 <= x && x < 0xE000 {
845 s.error("escape sequence is invalid Unicode code point")
846 return false
847 }
848
849 return true
850 }
851
View as plain text