...

Source file src/go/scanner/scanner.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package scanner implements a scanner for Go source text.
     6	// It takes a []byte as source which can then be tokenized
     7	// through repeated calls to the Scan method.
     8	//
     9	package scanner
    10	
    11	import (
    12		"bytes"
    13		"fmt"
    14		"go/token"
    15		"path/filepath"
    16		"strconv"
    17		"unicode"
    18		"unicode/utf8"
    19	)
    20	
    21	// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
    22	// encountered and a handler was installed, the handler is called with a
    23	// position and an error message. The position points to the beginning of
    24	// the offending token.
    25	//
    26	type ErrorHandler func(pos token.Position, msg string)
    27	
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
type Scanner struct {
	// immutable state (assigned by Init)
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; < 0 means end-of-file
	offset     int  // byte offset of the current character in src
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // offset at which the current line starts
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
    50	
// bom is the Unicode byte order mark (U+FEFF). Init skips it when it is the
// very first character of the source; next reports it as an error at any
// other offset.
const bom = 0xFEFF // byte order mark, only permitted as very first character
    52	
    53	// Read the next Unicode char into s.ch.
    54	// s.ch < 0 means end-of-file.
    55	//
    56	func (s *Scanner) next() {
    57		if s.rdOffset < len(s.src) {
    58			s.offset = s.rdOffset
    59			if s.ch == '\n' {
    60				s.lineOffset = s.offset
    61				s.file.AddLine(s.offset)
    62			}
    63			r, w := rune(s.src[s.rdOffset]), 1
    64			switch {
    65			case r == 0:
    66				s.error(s.offset, "illegal character NUL")
    67			case r >= utf8.RuneSelf:
    68				// not ASCII
    69				r, w = utf8.DecodeRune(s.src[s.rdOffset:])
    70				if r == utf8.RuneError && w == 1 {
    71					s.error(s.offset, "illegal UTF-8 encoding")
    72				} else if r == bom && s.offset > 0 {
    73					s.error(s.offset, "illegal byte order mark")
    74				}
    75			}
    76			s.rdOffset += w
    77			s.ch = r
    78		} else {
    79			s.offset = len(s.src)
    80			if s.ch == '\n' {
    81				s.lineOffset = s.offset
    82				s.file.AddLine(s.offset)
    83			}
    84			s.ch = -1 // eof
    85		}
    86	}
    87	
    88	// peek returns the byte following the most recently read character without
    89	// advancing the scanner. If the scanner is at EOF, peek returns 0.
    90	func (s *Scanner) peek() byte {
    91		if s.rdOffset < len(s.src) {
    92			return s.src[s.rdOffset]
    93		}
    94		return 0
    95	}
    96	
// A Mode value is a set of flags (or 0).
// The flags control scanner behavior and are passed to Init.
//
type Mode uint

// Flags that can be combined (with |) into a Mode value.
const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)
   106	
   107	// Init prepares the scanner s to tokenize the text src by setting the
   108	// scanner at the beginning of src. The scanner uses the file set file
   109	// for position information and it adds line information for each line.
   110	// It is ok to re-use the same file when re-scanning the same file as
   111	// line information which is already present is ignored. Init causes a
   112	// panic if the file size does not match the src size.
   113	//
   114	// Calls to Scan will invoke the error handler err if they encounter a
   115	// syntax error and err is not nil. Also, for each error encountered,
   116	// the Scanner field ErrorCount is incremented by one. The mode parameter
   117	// determines how comments are handled.
   118	//
   119	// Note that Init may call err if there is an error in the first character
   120	// of the file.
   121	//
   122	func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
   123		// Explicitly initialize all fields since a scanner may be reused.
   124		if file.Size() != len(src) {
   125			panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
   126		}
   127		s.file = file
   128		s.dir, _ = filepath.Split(file.Name())
   129		s.src = src
   130		s.err = err
   131		s.mode = mode
   132	
   133		s.ch = ' '
   134		s.offset = 0
   135		s.rdOffset = 0
   136		s.lineOffset = 0
   137		s.insertSemi = false
   138		s.ErrorCount = 0
   139	
   140		s.next()
   141		if s.ch == bom {
   142			s.next() // ignore BOM at file beginning
   143		}
   144	}
   145	
   146	func (s *Scanner) error(offs int, msg string) {
   147		if s.err != nil {
   148			s.err(s.file.Position(s.file.Pos(offs)), msg)
   149		}
   150		s.ErrorCount++
   151	}
   152	
   153	func (s *Scanner) errorf(offs int, format string, args ...interface{}) {
   154		s.error(offs, fmt.Sprintf(format, args...))
   155	}
   156	
// scanComment scans a //-style or /*-style comment whose initial '/' has
// already been consumed and returns the comment text. An unterminated
// /*-style comment is reported as an error. If the comment is a valid
// line directive, the line information is updated via updateLineInfo.
// Carriage returns are removed from the returned text via stripCR.
func (s *Scanner) scanComment() string {
	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
	offs := s.offset - 1 // position of initial '/'
	next := -1           // position immediately following the comment; < 0 means invalid comment
	numCR := 0           // number of '\r' characters seen inside the comment

	if s.ch == '/' {
		//-style comment
		// (the final '\n' is not considered part of the comment)
		s.next()
		for s.ch != '\n' && s.ch >= 0 {
			if s.ch == '\r' {
				numCR++
			}
			s.next()
		}
		// if we are at '\n', the position following the comment is afterwards
		next = s.offset
		if s.ch == '\n' {
			next++
		}
		goto exit
	}

	/*-style comment */
	s.next()
	for s.ch >= 0 {
		ch := s.ch
		if ch == '\r' {
			numCR++
		}
		s.next()
		if ch == '*' && s.ch == '/' {
			// found the closing "*/"; consume the '/'
			s.next()
			next = s.offset
			goto exit
		}
	}

	// reached EOF without seeing "*/"; next stays < 0
	s.error(offs, "comment not terminated")

exit:
	lit := s.src[offs:s.offset]

	// On Windows, a (//-comment) line may end in "\r\n".
	// Remove the final '\r' before analyzing the text for
	// line directives (matching the compiler). Remove any
	// other '\r' afterwards (matching the pre-existing be-
	// havior of the scanner).
	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
		numCR--
	}

	// interpret line directives
	// (//line directives must start at the beginning of the current line)
	if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
		s.updateLineInfo(next, offs, lit)
	}

	// strip the remaining carriage returns; stripCR copies, so src is not modified
	if numCR > 0 {
		lit = stripCR(lit, lit[1] == '*')
	}

	return string(lit)
}
   223	
// prefix is the text that must directly follow the two-character comment
// opener ("//" or "/*") for scanComment to treat a comment as a line
// directive.
var prefix = []byte("line ")
   225	
   226	// updateLineInfo parses the incoming comment text at offset offs
   227	// as a line directive. If successful, it updates the line info table
   228	// for the position next per the line directive.
   229	func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
   230		// extract comment text
   231		if text[1] == '*' {
   232			text = text[:len(text)-2] // lop off trailing "*/"
   233		}
   234		text = text[7:] // lop off leading "//line " or "/*line "
   235		offs += 7
   236	
   237		i, n, ok := trailingDigits(text)
   238		if i == 0 {
   239			return // ignore (not a line directive)
   240		}
   241		// i > 0
   242	
   243		if !ok {
   244			// text has a suffix :xxx but xxx is not a number
   245			s.error(offs+i, "invalid line number: "+string(text[i:]))
   246			return
   247		}
   248	
   249		var line, col int
   250		i2, n2, ok2 := trailingDigits(text[:i-1])
   251		if ok2 {
   252			//line filename:line:col
   253			i, i2 = i2, i
   254			line, col = n2, n
   255			if col == 0 {
   256				s.error(offs+i2, "invalid column number: "+string(text[i2:]))
   257				return
   258			}
   259			text = text[:i2-1] // lop off ":col"
   260		} else {
   261			//line filename:line
   262			line = n
   263		}
   264	
   265		if line == 0 {
   266			s.error(offs+i, "invalid line number: "+string(text[i:]))
   267			return
   268		}
   269	
   270		// If we have a column (//line filename:line:col form),
   271		// an empty filename means to use the previous filename.
   272		filename := string(text[:i-1]) // lop off ":line", and trim white space
   273		if filename == "" && ok2 {
   274			filename = s.file.Position(s.file.Pos(offs)).Filename
   275		} else if filename != "" {
   276			// Put a relative filename in the current directory.
   277			// This is for compatibility with earlier releases.
   278			// See issue 26671.
   279			filename = filepath.Clean(filename)
   280			if !filepath.IsAbs(filename) {
   281				filename = filepath.Join(s.dir, filename)
   282			}
   283		}
   284	
   285		s.file.AddLineColumnInfo(next, filename, line, col)
   286	}
   287	
   288	func trailingDigits(text []byte) (int, int, bool) {
   289		i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
   290		if i < 0 {
   291			return 0, 0, false // no ":"
   292		}
   293		// i >= 0
   294		n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
   295		return i + 1, int(n), err == nil
   296	}
   297	
// findLineEnd is called with the initial '/' of a comment already consumed.
// It reads ahead over the comment and any comments that directly follow it
// (separated only by white space) and reports whether a newline or EOF is
// found before the next non-comment token. Before returning, the deferred
// function resets s.ch, s.offset and s.rdOffset to the state they had when
// findLineEnd was called.
func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		s.ch = '/'
		s.offset = offs
		s.rdOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1) // argument is evaluated now: the offset of the initial '/'

	// read ahead until a newline, EOF, or non-comment token is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			//-style comment always contains a newline
			return true
		}
		/*-style comment: look for newline */
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				// end of this comment; consume the '/' and look at what follows
				s.next()
				break
			}
		}
		s.skipWhitespace() // s.insertSemi is set
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment token
			return false
		}
		s.next() // consume '/'
	}

	// the '/' was not followed by '/' or '*', so it does not start a comment
	return false
}
   341	
   342	func isLetter(ch rune) bool {
   343		return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
   344	}
   345	
   346	func isDigit(ch rune) bool {
   347		return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
   348	}
   349	
   350	func (s *Scanner) scanIdentifier() string {
   351		offs := s.offset
   352		for isLetter(s.ch) || isDigit(s.ch) {
   353			s.next()
   354		}
   355		return string(s.src[offs:s.offset])
   356	}
   357	
   358	func digitVal(ch rune) int {
   359		switch {
   360		case '0' <= ch && ch <= '9':
   361			return int(ch - '0')
   362		case 'a' <= lower(ch) && lower(ch) <= 'f':
   363			return int(lower(ch) - 'a' + 10)
   364		}
   365		return 16 // larger than any legal digit val
   366	}
   367	
   368	func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
   369	func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
   370	func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
   371	
   372	// digits accepts the sequence { digit | '_' }.
   373	// If base <= 10, digits accepts any decimal digit but records
   374	// the offset (relative to the source start) of a digit >= base
   375	// in *invalid, if *invalid < 0.
   376	// digits returns a bitset describing whether the sequence contained
   377	// digits (bit 0 is set), or separators '_' (bit 1 is set).
   378	func (s *Scanner) digits(base int, invalid *int) (digsep int) {
   379		if base <= 10 {
   380			max := rune('0' + base)
   381			for isDecimal(s.ch) || s.ch == '_' {
   382				ds := 1
   383				if s.ch == '_' {
   384					ds = 2
   385				} else if s.ch >= max && *invalid < 0 {
   386					*invalid = int(s.offset) // record invalid rune offset
   387				}
   388				digsep |= ds
   389				s.next()
   390			}
   391		} else {
   392			for isHex(s.ch) || s.ch == '_' {
   393				ds := 1
   394				if s.ch == '_' {
   395					ds = 2
   396				}
   397				digsep |= ds
   398				s.next()
   399			}
   400		}
   401		return
   402	}
   403	
// scanNumber scans a numeric literal starting at the current character
// (a decimal digit or a '.') and returns the token kind (token.INT,
// token.FLOAT or token.IMAG) together with the literal text. Malformed
// literals are reported via s.error/s.errorf, but scanning continues so
// that the entire literal is consumed.
func (s *Scanner) scanNumber() (token.Token, string) {
	offs := s.offset
	tok := token.ILLEGAL

	base := 10        // number base
	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
	digsep := 0       // bit 0: digit present, bit 1: '_' present
	invalid := -1     // index of invalid digit in literal, or < 0

	// integer part
	if s.ch != '.' {
		tok = token.INT
		if s.ch == '0' {
			s.next()
			switch lower(s.ch) {
			case 'x':
				s.next()
				base, prefix = 16, 'x'
			case 'o':
				s.next()
				base, prefix = 8, 'o'
			case 'b':
				s.next()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // leading 0
			}
		}
		digsep |= s.digits(base, &invalid)
	}

	// fractional part
	if s.ch == '.' {
		tok = token.FLOAT
		if prefix == 'o' || prefix == 'b' {
			s.error(s.offset, "invalid radix point in "+litname(prefix))
		}
		s.next()
		digsep |= s.digits(base, &invalid)
	}

	// neither the integer nor the fractional part contained a digit
	if digsep&1 == 0 {
		s.error(s.offset, litname(prefix)+" has no digits")
	}

	// exponent
	if e := lower(s.ch); e == 'e' || e == 'p' {
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
		case e == 'p' && prefix != 'x':
			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
		}
		s.next()
		tok = token.FLOAT
		if s.ch == '+' || s.ch == '-' {
			s.next()
		}
		// invalid may be nil here: with base 10 no decimal digit is >= base,
		// so digits never dereferences it
		ds := s.digits(10, nil)
		digsep |= ds
		if ds&1 == 0 {
			s.error(s.offset, "exponent has no digits")
		}
	} else if prefix == 'x' && tok == token.FLOAT {
		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
	}

	// suffix 'i'
	if s.ch == 'i' {
		tok = token.IMAG
		s.next()
	}

	lit := string(s.src[offs:s.offset])
	// an out-of-range digit is only reported if the literal remained an integer
	if tok == token.INT && invalid >= 0 {
		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
	}
	// separators are validated only if at least one '_' was seen
	if digsep&2 != 0 {
		if i := invalidSep(lit); i >= 0 {
			s.error(offs+i, "'_' must separate successive digits")
		}
	}

	return tok, lit
}
   490	
   491	func litname(prefix rune) string {
   492		switch prefix {
   493		case 'x':
   494			return "hexadecimal literal"
   495		case 'o', '0':
   496			return "octal literal"
   497		case 'b':
   498			return "binary literal"
   499		}
   500		return "decimal literal"
   501	}
   502	
   503	// invalidSep returns the index of the first invalid separator in x, or -1.
   504	func invalidSep(x string) int {
   505		x1 := ' ' // prefix char, we only care if it's 'x'
   506		d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
   507		i := 0
   508	
   509		// a prefix counts as a digit
   510		if len(x) >= 2 && x[0] == '0' {
   511			x1 = lower(rune(x[1]))
   512			if x1 == 'x' || x1 == 'o' || x1 == 'b' {
   513				d = '0'
   514				i = 2
   515			}
   516		}
   517	
   518		// mantissa and exponent
   519		for ; i < len(x); i++ {
   520			p := d // previous digit
   521			d = rune(x[i])
   522			switch {
   523			case d == '_':
   524				if p != '0' {
   525					return i
   526				}
   527			case isDecimal(d) || x1 == 'x' && isHex(d):
   528				d = '0'
   529			default:
   530				if p == '_' {
   531					return i - 1
   532				}
   533				d = '.'
   534			}
   535		}
   536		if d == '_' {
   537			return len(x) - 1
   538		}
   539	
   540		return -1
   541	}
   542	
   543	// scanEscape parses an escape sequence where rune is the accepted
   544	// escaped quote. In case of a syntax error, it stops at the offending
   545	// character (without consuming it) and returns false. Otherwise
   546	// it returns true.
   547	func (s *Scanner) scanEscape(quote rune) bool {
   548		offs := s.offset
   549	
   550		var n int
   551		var base, max uint32
   552		switch s.ch {
   553		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   554			s.next()
   555			return true
   556		case '0', '1', '2', '3', '4', '5', '6', '7':
   557			n, base, max = 3, 8, 255
   558		case 'x':
   559			s.next()
   560			n, base, max = 2, 16, 255
   561		case 'u':
   562			s.next()
   563			n, base, max = 4, 16, unicode.MaxRune
   564		case 'U':
   565			s.next()
   566			n, base, max = 8, 16, unicode.MaxRune
   567		default:
   568			msg := "unknown escape sequence"
   569			if s.ch < 0 {
   570				msg = "escape sequence not terminated"
   571			}
   572			s.error(offs, msg)
   573			return false
   574		}
   575	
   576		var x uint32
   577		for n > 0 {
   578			d := uint32(digitVal(s.ch))
   579			if d >= base {
   580				msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
   581				if s.ch < 0 {
   582					msg = "escape sequence not terminated"
   583				}
   584				s.error(s.offset, msg)
   585				return false
   586			}
   587			x = x*base + d
   588			s.next()
   589			n--
   590		}
   591	
   592		if x > max || 0xD800 <= x && x < 0xE000 {
   593			s.error(offs, "escape sequence is invalid Unicode code point")
   594			return false
   595		}
   596	
   597		return true
   598	}
   599	
   600	func (s *Scanner) scanRune() string {
   601		// '\'' opening already consumed
   602		offs := s.offset - 1
   603	
   604		valid := true
   605		n := 0
   606		for {
   607			ch := s.ch
   608			if ch == '\n' || ch < 0 {
   609				// only report error if we don't have one already
   610				if valid {
   611					s.error(offs, "rune literal not terminated")
   612					valid = false
   613				}
   614				break
   615			}
   616			s.next()
   617			if ch == '\'' {
   618				break
   619			}
   620			n++
   621			if ch == '\\' {
   622				if !s.scanEscape('\'') {
   623					valid = false
   624				}
   625				// continue to read to closing quote
   626			}
   627		}
   628	
   629		if valid && n != 1 {
   630			s.error(offs, "illegal rune literal")
   631		}
   632	
   633		return string(s.src[offs:s.offset])
   634	}
   635	
   636	func (s *Scanner) scanString() string {
   637		// '"' opening already consumed
   638		offs := s.offset - 1
   639	
   640		for {
   641			ch := s.ch
   642			if ch == '\n' || ch < 0 {
   643				s.error(offs, "string literal not terminated")
   644				break
   645			}
   646			s.next()
   647			if ch == '"' {
   648				break
   649			}
   650			if ch == '\\' {
   651				s.scanEscape('"')
   652			}
   653		}
   654	
   655		return string(s.src[offs:s.offset])
   656	}
   657	
   658	func stripCR(b []byte, comment bool) []byte {
   659		c := make([]byte, len(b))
   660		i := 0
   661		for j, ch := range b {
   662			// In a /*-style comment, don't strip \r from *\r/ (incl.
   663			// sequences of \r from *\r\r...\r/) since the resulting
   664			// */ would terminate the comment too early unless the \r
   665			// is immediately following the opening /* in which case
   666			// it's ok because /*/ is not closed yet (issue #11151).
   667			if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
   668				c[i] = ch
   669				i++
   670			}
   671		}
   672		return c[:i]
   673	}
   674	
   675	func (s *Scanner) scanRawString() string {
   676		// '`' opening already consumed
   677		offs := s.offset - 1
   678	
   679		hasCR := false
   680		for {
   681			ch := s.ch
   682			if ch < 0 {
   683				s.error(offs, "raw string literal not terminated")
   684				break
   685			}
   686			s.next()
   687			if ch == '`' {
   688				break
   689			}
   690			if ch == '\r' {
   691				hasCR = true
   692			}
   693		}
   694	
   695		lit := s.src[offs:s.offset]
   696		if hasCR {
   697			lit = stripCR(lit, false)
   698		}
   699	
   700		return string(lit)
   701	}
   702	
   703	func (s *Scanner) skipWhitespace() {
   704		for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
   705			s.next()
   706		}
   707	}
   708	
   709	// Helper functions for scanning multi-byte tokens such as >> += >>= .
   710	// Different routines recognize different length tok_i based on matches
   711	// of ch_i. If a token ends in '=', the result is tok1 or tok3
   712	// respectively. Otherwise, the result is tok0 if there was no other
   713	// matching character, or tok2 if the matching character was ch2.
   714	
   715	func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
   716		if s.ch == '=' {
   717			s.next()
   718			return tok1
   719		}
   720		return tok0
   721	}
   722	
   723	func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
   724		if s.ch == '=' {
   725			s.next()
   726			return tok1
   727		}
   728		if s.ch == ch2 {
   729			s.next()
   730			return tok2
   731		}
   732		return tok0
   733	}
   734	
   735	func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
   736		if s.ch == '=' {
   737			s.next()
   738			return tok1
   739		}
   740		if s.ch == ch2 {
   741			s.next()
   742			if s.ch == '=' {
   743				s.next()
   744				return tok3
   745			}
   746			return tok2
   747		}
   748		return tok0
   749	}
   750	
// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// token.EOF.
//
// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
// has the corresponding value.
//
// If the returned token is a keyword, the literal string is the keyword.
//
// If the returned token is token.SEMICOLON, the corresponding
// literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// If the returned token is token.ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
//
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	// (insertSemi records whether a newline following this token
	// is to be turned into a semicolon; it is stored in s.insertSemi
	// at the end unless dontInsertSemis is set)
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		s.next() // always make progress
		// from here on ch is the character just consumed and s.ch the one after it
		switch ch {
		case -1:
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// fractions starting with a '.' are handled by outer switch
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertSemi && s.findLineEnd() {
					// a semicolon is due and the comment leads to a line end:
					// return the semicolon first; the comment is scanned by
					// the next call
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertSemi = false // newline consumed
					return pos, token.SEMICOLON, "\n"
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertSemi = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	// remember for the next call whether a newline is to become a semicolon
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}
   941	

View as plain text