Source file src/pkg/cmd/compile/internal/syntax/scanner.go

     1	// Copyright 2016 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// This file implements scanner, a lexical tokenizer for
     6	// Go source. After initialization, consecutive calls of
     7	// next advance the scanner one token at a time.
     8	//
     9	// This file, source.go, and tokens.go are self-contained
    10	// (go tool compile scanner.go source.go tokens.go compiles)
    11	// and thus could be made into its own package.
    12	
    13	package syntax
    14	
    15	import (
    16		"fmt"
    17		"io"
    18		"unicode"
    19		"unicode/utf8"
    20	)
    21	
    22	// The mode flags below control which comments are reported
    23	// by calling the error handler. If no flag is set, comments
    24	// are ignored.
    25	const (
    26		comments   uint = 1 << iota // call handler for all comments
    27		directives                  // call handler for directives only
    28	)
    29	
// scanner holds the tokenizer state: the embedded character source,
// the comment-reporting mode, the pending semicolon-insertion flag,
// and a description of the token most recently produced by next.
type scanner struct {
	// embedded character source; set up by scanner.init
	source
	// bit set of the comments/directives mode flags, as passed to init
	mode   uint
	nlsemi bool // if set '\n' and EOF translate to ';'

	// current token, valid after calling next()
	// (line and col are the position at which the token starts)
	line, col uint
	tok       token
	lit       string   // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF")
	kind      LitKind  // valid if tok is _Literal
	op        Operator // valid if tok is _Operator, _AssignOp, or _IncOp
	prec      int      // valid if tok is _Operator, _AssignOp, or _IncOp
}
    43	
    44	func (s *scanner) init(src io.Reader, errh func(line, col uint, msg string), mode uint) {
    45		s.source.init(src, errh)
    46		s.mode = mode
    47		s.nlsemi = false
    48	}
    49	
    50	func (s *scanner) errorf(format string, args ...interface{}) {
    51		s.error(fmt.Sprintf(format, args...))
    52	}
    53	
// next advances the scanner by reading the next token.
//
// If a read, source encoding, or lexical error occurs, next calls
// the installed error handler with the respective error position
// and message. The error message is guaranteed to be non-empty and
// never starts with a '/'. The error handler must exist.
//
// If the scanner mode includes the comments flag and a comment
// (including comments containing directives) is encountered, the
// error handler is also called with each comment position and text
// (including opening /* or // and closing */, but without a newline
// at the end of line comments). Comment text always starts with a /
// which can be used to distinguish these handler calls from errors.
//
// If the scanner mode includes the directives (but not the comments)
// flag, only comments containing a //line, /*line, or //go: directive
// are reported, in the same way as regular comments. Directives in
// //-style comments are only recognized if they are at the beginning
// of a line.
//
// On return, s.line, s.col, and s.tok describe the token; s.lit,
// s.kind, s.op, and s.prec are set for the token kinds listed in
// the scanner struct.
func (s *scanner) next() {
	// Remember whether the previous token requested automatic semicolon
	// insertion, then clear the flag; the token scanned below sets it
	// again if it can end a statement.
	nlsemi := s.nlsemi
	s.nlsemi = false

redo:
	// skip white space
	// (a newline counts as white space only if no semicolon is pending)
	c := s.getr()
	for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
		c = s.getr()
	}

	// token start
	s.line, s.col = s.source.line0, s.source.col0

	// Identifiers and keywords: ASCII letters and '_' take the fast
	// check; any other rune >= utf8.RuneSelf is vetted by isIdentRune.
	if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) {
		s.ident()
		return
	}

	switch c {
	case -1:
		// end of input: becomes a ';' first if one is pending
		if nlsemi {
			s.lit = "EOF"
			s.tok = _Semi
			break
		}
		s.tok = _EOF

	case '\n':
		// only reached if nlsemi is set (otherwise skipped as white space)
		s.lit = "newline"
		s.tok = _Semi

	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		s.number(c)

	case '"':
		s.stdString()

	case '`':
		s.rawString()

	case '\'':
		s.rune()

	case '(':
		s.tok = _Lparen

	case '[':
		s.tok = _Lbrack

	case '{':
		s.tok = _Lbrace

	case ',':
		s.tok = _Comma

	case ';':
		s.lit = "semicolon"
		s.tok = _Semi

	// closing brackets may end a statement: request semicolon insertion
	case ')':
		s.nlsemi = true
		s.tok = _Rparen

	case ']':
		s.nlsemi = true
		s.tok = _Rbrack

	case '}':
		s.nlsemi = true
		s.tok = _Rbrace

	case ':':
		// ":=" or ":"
		if s.getr() == '=' {
			s.tok = _Define
			break
		}
		s.ungetr()
		s.tok = _Colon

	case '.':
		// ".5" style number, "...", or "."
		c = s.getr()
		if isDecimal(c) {
			s.ungetr()
			s.unread(1) // correct position of '.' (needed by startLit in number)
			s.number('.')
			break
		}
		if c == '.' {
			c = s.getr()
			if c == '.' {
				s.tok = _DotDotDot
				break
			}
			// ".." is not a token: give back the second '.' as well
			s.unread(1)
		}
		s.ungetr()
		s.tok = _Dot

	case '+':
		// "++", "+=", or "+"
		s.op, s.prec = Add, precAdd
		c = s.getr()
		if c != '+' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '-':
		// "--", "-=", or "-"
		s.op, s.prec = Sub, precAdd
		c = s.getr()
		if c != '-' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '*':
		s.op, s.prec = Mul, precMul
		// don't goto assignop - want _Star token
		if s.getr() == '=' {
			s.tok = _AssignOp
			break
		}
		s.ungetr()
		s.tok = _Star

	case '/':
		// line comment, general comment, "/=", or "/"
		c = s.getr()
		if c == '/' {
			s.lineComment()
			goto redo
		}
		if c == '*' {
			s.fullComment()
			if s.source.line > s.line && nlsemi {
				// A multi-line comment acts like a newline;
				// it translates to a ';' if nlsemi is set.
				s.lit = "newline"
				s.tok = _Semi
				break
			}
			goto redo
		}
		s.op, s.prec = Div, precMul
		goto assignop

	case '%':
		s.op, s.prec = Rem, precMul
		c = s.getr()
		goto assignop

	case '&':
		// "&&", "&^", "&^=", "&=", or "&"
		c = s.getr()
		if c == '&' {
			s.op, s.prec = AndAnd, precAndAnd
			s.tok = _Operator
			break
		}
		s.op, s.prec = And, precMul
		if c == '^' {
			s.op = AndNot
			c = s.getr()
		}
		goto assignop

	case '|':
		// "||", "|=", or "|"
		c = s.getr()
		if c == '|' {
			s.op, s.prec = OrOr, precOrOr
			s.tok = _Operator
			break
		}
		s.op, s.prec = Or, precAdd
		goto assignop

	case '^':
		s.op, s.prec = Xor, precAdd
		c = s.getr()
		goto assignop

	case '<':
		// "<=", "<<", "<<=", "<-", or "<"
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Leq, precCmp
			s.tok = _Operator
			break
		}
		if c == '<' {
			s.op, s.prec = Shl, precMul
			c = s.getr()
			goto assignop
		}
		if c == '-' {
			s.tok = _Arrow
			break
		}
		s.ungetr()
		s.op, s.prec = Lss, precCmp
		s.tok = _Operator

	case '>':
		// ">=", ">>", ">>=", or ">"
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Geq, precCmp
			s.tok = _Operator
			break
		}
		if c == '>' {
			s.op, s.prec = Shr, precMul
			c = s.getr()
			goto assignop
		}
		s.ungetr()
		s.op, s.prec = Gtr, precCmp
		s.tok = _Operator

	case '=':
		// "==" or "="
		if s.getr() == '=' {
			s.op, s.prec = Eql, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.tok = _Assign

	case '!':
		// "!=" or "!"
		if s.getr() == '=' {
			s.op, s.prec = Neq, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.op, s.prec = Not, 0
		s.tok = _Operator

	default:
		// report the offending character and try again with the next one
		s.tok = 0
		s.errorf("invalid character %#U", c)
		goto redo
	}

	return

	// Shared tail for binary operators that may be followed by '='.
	// On entry s.op and s.prec are set and c holds the character read
	// after the operator.
assignop:
	if c == '=' {
		s.tok = _AssignOp
		return
	}
	s.ungetr()
	s.tok = _Operator
}
   326	
   327	func isLetter(c rune) bool {
   328		return 'a' <= lower(c) && lower(c) <= 'z' || c == '_'
   329	}
   330	
   331	func (s *scanner) ident() {
   332		s.startLit()
   333	
   334		// accelerate common case (7bit ASCII)
   335		c := s.getr()
   336		for isLetter(c) || isDecimal(c) {
   337			c = s.getr()
   338		}
   339	
   340		// general case
   341		if c >= utf8.RuneSelf {
   342			for s.isIdentRune(c, false) {
   343				c = s.getr()
   344			}
   345		}
   346		s.ungetr()
   347	
   348		lit := s.stopLit()
   349	
   350		// possibly a keyword
   351		if len(lit) >= 2 {
   352			if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) {
   353				s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
   354				s.tok = tok
   355				return
   356			}
   357		}
   358	
   359		s.nlsemi = true
   360		s.lit = string(lit)
   361		s.tok = _Name
   362	}
   363	
   364	// tokStrFast is a faster version of token.String, which assumes that tok
   365	// is one of the valid tokens - and can thus skip bounds checks.
   366	func tokStrFast(tok token) string {
   367		return _token_name[_token_index[tok-1]:_token_index[tok]]
   368	}
   369	
   370	func (s *scanner) isIdentRune(c rune, first bool) bool {
   371		switch {
   372		case unicode.IsLetter(c) || c == '_':
   373			// ok
   374		case unicode.IsDigit(c):
   375			if first {
   376				s.errorf("identifier cannot begin with digit %#U", c)
   377			}
   378		case c >= utf8.RuneSelf:
   379			s.errorf("invalid identifier character %#U", c)
   380		default:
   381			return false
   382		}
   383		return true
   384	}
   385	
   386	// hash is a perfect hash function for keywords.
   387	// It assumes that s has at least length 2.
   388	func hash(s []byte) uint {
   389		return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
   390	}
   391	
   392	var keywordMap [1 << 6]token // size must be power of two
   393	
   394	func init() {
   395		// populate keywordMap
   396		for tok := _Break; tok <= _Var; tok++ {
   397			h := hash([]byte(tok.String()))
   398			if keywordMap[h] != 0 {
   399				panic("imperfect hash")
   400			}
   401			keywordMap[h] = tok
   402		}
   403	}
   404	
   405	func lower(c rune) rune     { return ('a' - 'A') | c } // returns lower-case c iff c is ASCII letter
   406	func isDecimal(c rune) bool { return '0' <= c && c <= '9' }
   407	func isHex(c rune) bool     { return '0' <= c && c <= '9' || 'a' <= lower(c) && lower(c) <= 'f' }
   408	
   409	// digits accepts the sequence { digit | '_' } starting with c0.
   410	// If base <= 10, digits accepts any decimal digit but records
   411	// the index (relative to the literal start) of a digit >= base
   412	// in *invalid, if *invalid < 0.
   413	// digits returns the first rune that is not part of the sequence
   414	// anymore, and a bitset describing whether the sequence contained
   415	// digits (bit 0 is set), or separators '_' (bit 1 is set).
   416	func (s *scanner) digits(c0 rune, base int, invalid *int) (c rune, digsep int) {
   417		c = c0
   418		if base <= 10 {
   419			max := rune('0' + base)
   420			for isDecimal(c) || c == '_' {
   421				ds := 1
   422				if c == '_' {
   423					ds = 2
   424				} else if c >= max && *invalid < 0 {
   425					*invalid = int(s.col0 - s.col) // record invalid rune index
   426				}
   427				digsep |= ds
   428				c = s.getr()
   429			}
   430		} else {
   431			for isHex(c) || c == '_' {
   432				ds := 1
   433				if c == '_' {
   434					ds = 2
   435				}
   436				digsep |= ds
   437				c = s.getr()
   438			}
   439		}
   440		return
   441	}
   442	
// number scans a numeric literal. c is the first character of the
// literal: a decimal digit, or '.' when next has seen a '.' that is
// followed by a decimal digit.
//
// On return s.tok is _Literal, s.kind is IntLit, FloatLit, or ImagLit,
// s.lit holds the literal text, and s.nlsemi is set. Malformed
// literals are reported through the error handler but still yield a
// _Literal token.
func (s *scanner) number(c rune) {
	s.startLit()

	base := 10        // number base
	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
	digsep := 0       // bit 0: digit present, bit 1: '_' present
	invalid := -1     // index of invalid digit in literal, or < 0

	// integer part
	var ds int
	if c != '.' {
		s.kind = IntLit
		if c == '0' {
			// a leading 0 selects the base via the character that follows
			c = s.getr()
			switch lower(c) {
			case 'x':
				c = s.getr()
				base, prefix = 16, 'x'
			case 'o':
				c = s.getr()
				base, prefix = 8, 'o'
			case 'b':
				c = s.getr()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // leading 0
			}
		}
		c, ds = s.digits(c, base, &invalid)
		digsep |= ds
	}

	// fractional part
	if c == '.' {
		s.kind = FloatLit
		if prefix == 'o' || prefix == 'b' {
			s.error("invalid radix point in " + litname(prefix))
		}
		c, ds = s.digits(s.getr(), base, &invalid)
		digsep |= ds
	}

	// the mantissa (integer and fractional part together) needs a digit
	if digsep&1 == 0 {
		s.error(litname(prefix) + " has no digits")
	}

	// exponent
	if e := lower(c); e == 'e' || e == 'p' {
		// 'e' is only valid after a decimal (or 0-octal) mantissa,
		// 'p' only after a hexadecimal mantissa
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			s.errorf("%q exponent requires decimal mantissa", c)
		case e == 'p' && prefix != 'x':
			s.errorf("%q exponent requires hexadecimal mantissa", c)
		}
		c = s.getr()
		s.kind = FloatLit
		if c == '+' || c == '-' {
			c = s.getr()
		}
		// exponent digits are decimal; with base 10 digits never
		// records an invalid digit, so passing nil is safe
		c, ds = s.digits(c, 10, nil)
		digsep |= ds
		if ds&1 == 0 {
			s.error("exponent has no digits")
		}
	} else if prefix == 'x' && s.kind == FloatLit {
		s.error("hexadecimal mantissa requires a 'p' exponent")
	}

	// suffix 'i'
	if c == 'i' {
		s.kind = ImagLit
		c = s.getr()
	}
	// c is the first character past the literal: put it back
	s.ungetr()

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.tok = _Literal

	// An out-of-range digit is an error only in an integer literal;
	// it is reported at the position of the offending digit.
	if s.kind == IntLit && invalid >= 0 {
		s.errh(s.line, s.col+uint(invalid), fmt.Sprintf("invalid digit %q in %s", s.lit[invalid], litname(prefix)))
	}

	// check separator placement only if a '_' was seen at all
	if digsep&2 != 0 {
		if i := invalidSep(s.lit); i >= 0 {
			s.errh(s.line, s.col+uint(i), "'_' must separate successive digits")
		}
	}
}
   533	
   534	func litname(prefix rune) string {
   535		switch prefix {
   536		case 'x':
   537			return "hexadecimal literal"
   538		case 'o', '0':
   539			return "octal literal"
   540		case 'b':
   541			return "binary literal"
   542		}
   543		return "decimal literal"
   544	}
   545	
   546	// invalidSep returns the index of the first invalid separator in x, or -1.
   547	func invalidSep(x string) int {
   548		x1 := ' ' // prefix char, we only care if it's 'x'
   549		d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
   550		i := 0
   551	
   552		// a prefix counts as a digit
   553		if len(x) >= 2 && x[0] == '0' {
   554			x1 = lower(rune(x[1]))
   555			if x1 == 'x' || x1 == 'o' || x1 == 'b' {
   556				d = '0'
   557				i = 2
   558			}
   559		}
   560	
   561		// mantissa and exponent
   562		for ; i < len(x); i++ {
   563			p := d // previous digit
   564			d = rune(x[i])
   565			switch {
   566			case d == '_':
   567				if p != '0' {
   568					return i
   569				}
   570			case isDecimal(d) || x1 == 'x' && isHex(d):
   571				d = '0'
   572			default:
   573				if p == '_' {
   574					return i - 1
   575				}
   576				d = '.'
   577			}
   578		}
   579		if d == '_' {
   580			return len(x) - 1
   581		}
   582	
   583		return -1
   584	}
   585	
   586	func (s *scanner) rune() {
   587		s.startLit()
   588	
   589		ok := true // only report errors if we're ok so far
   590		n := 0
   591		for ; ; n++ {
   592			r := s.getr()
   593			if r == '\'' {
   594				break
   595			}
   596			if r == '\\' {
   597				if !s.escape('\'') {
   598					ok = false
   599				}
   600				continue
   601			}
   602			if r == '\n' {
   603				s.ungetr() // assume newline is not part of literal
   604				if ok {
   605					s.error("newline in character literal")
   606					ok = false
   607				}
   608				break
   609			}
   610			if r < 0 {
   611				if ok {
   612					s.errh(s.line, s.col, "invalid character literal (missing closing ')")
   613					ok = false
   614				}
   615				break
   616			}
   617		}
   618	
   619		if ok {
   620			if n == 0 {
   621				s.error("empty character literal or unescaped ' in character literal")
   622			} else if n != 1 {
   623				s.errh(s.line, s.col, "invalid character literal (more than one character)")
   624			}
   625		}
   626	
   627		s.nlsemi = true
   628		s.lit = string(s.stopLit())
   629		s.kind = RuneLit
   630		s.tok = _Literal
   631	}
   632	
   633	func (s *scanner) stdString() {
   634		s.startLit()
   635	
   636		for {
   637			r := s.getr()
   638			if r == '"' {
   639				break
   640			}
   641			if r == '\\' {
   642				s.escape('"')
   643				continue
   644			}
   645			if r == '\n' {
   646				s.ungetr() // assume newline is not part of literal
   647				s.error("newline in string")
   648				break
   649			}
   650			if r < 0 {
   651				s.errh(s.line, s.col, "string not terminated")
   652				break
   653			}
   654		}
   655	
   656		s.nlsemi = true
   657		s.lit = string(s.stopLit())
   658		s.kind = StringLit
   659		s.tok = _Literal
   660	}
   661	
   662	func (s *scanner) rawString() {
   663		s.startLit()
   664	
   665		for {
   666			r := s.getr()
   667			if r == '`' {
   668				break
   669			}
   670			if r < 0 {
   671				s.errh(s.line, s.col, "string not terminated")
   672				break
   673			}
   674		}
   675		// We leave CRs in the string since they are part of the
   676		// literal (even though they are not part of the literal
   677		// value).
   678	
   679		s.nlsemi = true
   680		s.lit = string(s.stopLit())
   681		s.kind = StringLit
   682		s.tok = _Literal
   683	}
   684	
   685	func (s *scanner) comment(text string) {
   686		s.errh(s.line, s.col, text)
   687	}
   688	
   689	func (s *scanner) skipLine(r rune) {
   690		for r >= 0 {
   691			if r == '\n' {
   692				s.ungetr() // don't consume '\n' - needed for nlsemi logic
   693				break
   694			}
   695			r = s.getr()
   696		}
   697	}
   698	
   699	func (s *scanner) lineComment() {
   700		r := s.getr()
   701	
   702		if s.mode&comments != 0 {
   703			s.startLit()
   704			s.skipLine(r)
   705			s.comment("//" + string(s.stopLit()))
   706			return
   707		}
   708	
   709		// directives must start at the beginning of the line (s.col == colbase)
   710		if s.mode&directives == 0 || s.col != colbase || (r != 'g' && r != 'l') {
   711			s.skipLine(r)
   712			return
   713		}
   714	
   715		// recognize go: or line directives
   716		prefix := "go:"
   717		if r == 'l' {
   718			prefix = "line "
   719		}
   720		for _, m := range prefix {
   721			if r != m {
   722				s.skipLine(r)
   723				return
   724			}
   725			r = s.getr()
   726		}
   727	
   728		// directive text
   729		s.startLit()
   730		s.skipLine(r)
   731		s.comment("//" + prefix + string(s.stopLit()))
   732	}
   733	
   734	func (s *scanner) skipComment(r rune) bool {
   735		for r >= 0 {
   736			for r == '*' {
   737				r = s.getr()
   738				if r == '/' {
   739					return true
   740				}
   741			}
   742			r = s.getr()
   743		}
   744		s.errh(s.line, s.col, "comment not terminated")
   745		return false
   746	}
   747	
   748	func (s *scanner) fullComment() {
   749		r := s.getr()
   750	
   751		if s.mode&comments != 0 {
   752			s.startLit()
   753			if s.skipComment(r) {
   754				s.comment("/*" + string(s.stopLit()))
   755			} else {
   756				s.killLit() // not a complete comment - ignore
   757			}
   758			return
   759		}
   760	
   761		if s.mode&directives == 0 || r != 'l' {
   762			s.skipComment(r)
   763			return
   764		}
   765	
   766		// recognize line directive
   767		const prefix = "line "
   768		for _, m := range prefix {
   769			if r != m {
   770				s.skipComment(r)
   771				return
   772			}
   773			r = s.getr()
   774		}
   775	
   776		// directive text
   777		s.startLit()
   778		if s.skipComment(r) {
   779			s.comment("/*" + prefix + string(s.stopLit()))
   780		} else {
   781			s.killLit() // not a complete comment - ignore
   782		}
   783	}
   784	
   785	func (s *scanner) escape(quote rune) bool {
   786		var n int
   787		var base, max uint32
   788	
   789		c := s.getr()
   790		switch c {
   791		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   792			return true
   793		case '0', '1', '2', '3', '4', '5', '6', '7':
   794			n, base, max = 3, 8, 255
   795		case 'x':
   796			c = s.getr()
   797			n, base, max = 2, 16, 255
   798		case 'u':
   799			c = s.getr()
   800			n, base, max = 4, 16, unicode.MaxRune
   801		case 'U':
   802			c = s.getr()
   803			n, base, max = 8, 16, unicode.MaxRune
   804		default:
   805			if c < 0 {
   806				return true // complain in caller about EOF
   807			}
   808			s.error("unknown escape sequence")
   809			return false
   810		}
   811	
   812		var x uint32
   813		for i := n; i > 0; i-- {
   814			d := base
   815			switch {
   816			case isDecimal(c):
   817				d = uint32(c) - '0'
   818			case 'a' <= lower(c) && lower(c) <= 'f':
   819				d = uint32(lower(c)) - ('a' - 10)
   820			}
   821			if d >= base {
   822				if c < 0 {
   823					return true // complain in caller about EOF
   824				}
   825				kind := "hex"
   826				if base == 8 {
   827					kind = "octal"
   828				}
   829				s.errorf("non-%s character in escape sequence: %c", kind, c)
   830				s.ungetr()
   831				return false
   832			}
   833			// d < base
   834			x = x*base + d
   835			c = s.getr()
   836		}
   837		s.ungetr()
   838	
   839		if x > max && base == 8 {
   840			s.errorf("octal escape value > 255: %d", x)
   841			return false
   842		}
   843	
   844		if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
   845			s.error("escape sequence is invalid Unicode code point")
   846			return false
   847		}
   848	
   849		return true
   850	}
   851
View as plain text