Source file src/pkg/cmd/compile/internal/syntax/source.go

     1	// Copyright 2016 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// This file implements source, a buffered rune reader
     6	// which is specialized for the needs of the Go scanner:
     7	// Contiguous sequences of runes (literals) are extracted
     8	// directly as []byte without the need to re-encode the
     9	// runes in UTF-8 (as would be necessary with bufio.Reader).
    10	//
    11	// This file is self-contained (go tool compile source.go
    12	// compiles) and thus could be made into its own package.
    13	
    14	package syntax
    15	
    16	import (
    17		"io"
    18		"unicode/utf8"
    19	)
    20	
    21	// starting points for line and column numbers
    22	const linebase = 1
    23	const colbase = 1
    24	
    25	// max. number of bytes to unread
    26	const maxunread = 10
    27	
    28	// buf [...read...|...|...unread...|s|...free...]
    29	//         ^      ^   ^            ^
    30	//         |      |   |            |
    31	//        suf     r0  r            w
    32	
    33	type source struct {
    34		src  io.Reader
    35		errh func(line, pos uint, msg string)
    36	
    37		// source buffer
    38		buf         [4 << 10]byte
    39		r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
    40		line0, line uint  // previous/current line
    41		col0, col   uint  // previous/current column (byte offsets from line start)
    42		ioerr       error // pending io error
    43	
    44		// literal buffer
    45		lit []byte // literal prefix
    46		suf int    // literal suffix; suf >= 0 means we are scanning a literal
    47	}
    48	
    49	// init initializes source to read from src and to report errors via errh.
    50	// errh must not be nil.
    51	func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
    52		s.src = src
    53		s.errh = errh
    54	
    55		s.buf[0] = utf8.RuneSelf // terminate with sentinel
    56		s.r0, s.r, s.w = 0, 0, 0
    57		s.line0, s.line = 0, linebase
    58		s.col0, s.col = 0, colbase
    59		s.ioerr = nil
    60	
    61		s.lit = s.lit[:0]
    62		s.suf = -1
    63	}
    64	
    65	// ungetr sets the reading position to a previous reading
    66	// position, usually the one of the most recently read
    67	// rune, but possibly earlier (see unread below).
    68	func (s *source) ungetr() {
    69		s.r, s.line, s.col = s.r0, s.line0, s.col0
    70	}
    71	
    72	// unread moves the previous reading position to a position
    73	// that is n bytes earlier in the source. The next ungetr
    74	// call will set the reading position to that moved position.
    75	// The "unread" runes must be single byte and not contain any
    76	// newlines; and 0 <= n <= maxunread must hold.
    77	func (s *source) unread(n int) {
    78		s.r0 -= n
    79		s.col0 -= uint(n)
    80	}
    81	
    82	func (s *source) error(msg string) {
    83		s.errh(s.line0, s.col0, msg)
    84	}
    85	
    86	// getr reads and returns the next rune.
    87	//
    88	// If a read or source encoding error occurs, getr
    89	// calls the error handler installed with init.
    90	// The handler must exist.
    91	//
    92	// The (line, col) position passed to the error handler
    93	// is always at the current source reading position.
    94	func (s *source) getr() rune {
    95	redo:
    96		s.r0, s.line0, s.col0 = s.r, s.line, s.col
    97	
    98		// We could avoid at least one test that is always taken in the
    99		// for loop below by duplicating the common case code (ASCII)
   100		// here since we always have at least the sentinel (utf8.RuneSelf)
   101		// in the buffer. Measure and optimize if necessary.
   102	
   103		// make sure we have at least one rune in buffer, or we are at EOF
   104		for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
   105			s.fill() // s.w-s.r < len(s.buf) => buffer is not full
   106		}
   107	
   108		// common case: ASCII and enough bytes
   109		// (invariant: s.buf[s.w] == utf8.RuneSelf)
   110		if b := s.buf[s.r]; b < utf8.RuneSelf {
   111			s.r++
   112			// TODO(gri) Optimization: Instead of adjusting s.col for each character,
   113			// remember the line offset instead and then compute the offset as needed
   114			// (which is less often).
   115			s.col++
   116			if b == 0 {
   117				s.error("invalid NUL character")
   118				goto redo
   119			}
   120			if b == '\n' {
   121				s.line++
   122				s.col = colbase
   123			}
   124			return rune(b)
   125		}
   126	
   127		// EOF
   128		if s.r == s.w {
   129			if s.ioerr != io.EOF {
   130				// ensure we never start with a '/' (e.g., rooted path) in the error message
   131				s.error("I/O error: " + s.ioerr.Error())
   132			}
   133			return -1
   134		}
   135	
   136		// uncommon case: not ASCII
   137		r, w := utf8.DecodeRune(s.buf[s.r:s.w])
   138		s.r += w
   139		s.col += uint(w)
   140	
   141		if r == utf8.RuneError && w == 1 {
   142			s.error("invalid UTF-8 encoding")
   143			goto redo
   144		}
   145	
   146		// BOM's are only allowed as the first character in a file
   147		const BOM = 0xfeff
   148		if r == BOM {
   149			if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to maxunread)
   150				s.error("invalid BOM in the middle of the file")
   151			}
   152			goto redo
   153		}
   154	
   155		return r
   156	}
   157	
   158	func (s *source) fill() {
   159		// Slide unread bytes to beginning but preserve last read char
   160		// (for one ungetr call) plus maxunread extra bytes (for one
   161		// unread call).
   162		if s.r0 > maxunread {
   163			n := s.r0 - maxunread // number of bytes to slide down
   164			// save literal prefix, if any
   165			// (make sure we keep maxunread bytes and the last
   166			// read char in the buffer)
   167			if s.suf >= 0 {
   168				// we have a literal
   169				if s.suf < n {
   170					// save literal prefix
   171					s.lit = append(s.lit, s.buf[s.suf:n]...)
   172					s.suf = 0
   173				} else {
   174					s.suf -= n
   175				}
   176			}
   177			copy(s.buf[:], s.buf[n:s.w])
   178			s.r0 = maxunread // eqv: s.r0 -= n
   179			s.r -= n
   180			s.w -= n
   181		}
   182	
   183		// read more data: try a limited number of times
   184		for i := 100; i > 0; i-- {
   185			n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
   186			if n < 0 {
   187				panic("negative read") // incorrect underlying io.Reader implementation
   188			}
   189			s.w += n
   190			if n > 0 || err != nil {
   191				s.buf[s.w] = utf8.RuneSelf // sentinel
   192				if err != nil {
   193					s.ioerr = err
   194				}
   195				return
   196			}
   197		}
   198	
   199		s.buf[s.w] = utf8.RuneSelf // sentinel
   200		s.ioerr = io.ErrNoProgress
   201	}
   202	
   203	func (s *source) startLit() {
   204		s.suf = s.r0
   205		s.lit = s.lit[:0] // reuse lit
   206	}
   207	
   208	func (s *source) stopLit() []byte {
   209		lit := s.buf[s.suf:s.r]
   210		if len(s.lit) > 0 {
   211			lit = append(s.lit, lit...)
   212		}
   213		s.killLit()
   214		return lit
   215	}
   216	
   217	func (s *source) killLit() {
   218		s.suf = -1 // no pending literal
   219	}
   220
View as plain text