...

Source file src/encoding/csv/reader.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package csv reads and writes comma-separated values (CSV) files.
     6	// There are many kinds of CSV files; this package supports the format
     7	// described in RFC 4180.
     8	//
     9	// A csv file contains zero or more records of one or more fields per record.
    10	// Each record is separated by the newline character. The final record may
    11	// optionally be followed by a newline character.
    12	//
    13	//	field1,field2,field3
    14	//
    15	// White space is considered part of a field.
    16	//
    17	// Carriage returns before newline characters are silently removed.
    18	//
    19	// Blank lines are ignored. A line with only whitespace characters (excluding
    20	// the ending newline character) is not considered a blank line.
    21	//
    22	// Fields which start and stop with the quote character " are called
    23	// quoted-fields. The beginning and ending quote are not part of the
    24	// field.
    25	//
    26	// The source:
    27	//
    28	//	normal string,"quoted-field"
    29	//
    30	// results in the fields
    31	//
    32	//	{`normal string`, `quoted-field`}
    33	//
    34	// Within a quoted-field a quote character followed by a second quote
    35	// character is considered a single quote.
    36	//
    37	//	"the ""word"" is true","a ""quoted-field"""
    38	//
    39	// results in
    40	//
    41	//	{`the "word" is true`, `a "quoted-field"`}
    42	//
    43	// Newlines and commas may be included in a quoted-field
    44	//
    45	//	"Multi-line
    46	//	field","comma is ,"
    47	//
    48	// results in
    49	//
    50	//	{`Multi-line
    51	//	field`, `comma is ,`}
    52	package csv
    53	
    54	import (
    55		"bufio"
    56		"bytes"
    57		"errors"
    58		"fmt"
    59		"io"
    60		"unicode"
    61		"unicode/utf8"
    62	)
    63	
// A ParseError is returned for parsing errors.
// Line numbers are 1-indexed and columns are 0-indexed.
//
// StartLine and Line can differ when a quoted field spans multiple
// input lines: the record starts on one line and the error occurs on a
// later one.
type ParseError struct {
	StartLine int   // Line where the record starts
	Line      int   // Line where the error occurred
	Column    int   // Column (rune index) where the error occurred
	Err       error // The actual error
}
    72	
    73	func (e *ParseError) Error() string {
    74		if e.Err == ErrFieldCount {
    75			return fmt.Sprintf("record on line %d: %v", e.Line, e.Err)
    76		}
    77		if e.StartLine != e.Line {
    78			return fmt.Sprintf("record on line %d; parse error on line %d, column %d: %v", e.StartLine, e.Line, e.Column, e.Err)
    79		}
    80		return fmt.Sprintf("parse error on line %d, column %d: %v", e.Line, e.Column, e.Err)
    81	}
    82	
    83	func (e *ParseError) Unwrap() error { return e.Err }
    84	
// These are the errors that can be returned in ParseError.Err.
var (
	ErrTrailingComma = errors.New("extra delimiter at end of line") // Deprecated: No longer used.
	ErrBareQuote     = errors.New("bare \" in non-quoted-field")
	ErrQuote         = errors.New("extraneous or missing \" in quoted-field")
	ErrFieldCount    = errors.New("wrong number of fields")
)

// errInvalidDelim is returned (not wrapped in a ParseError) when Comma
// or Comment is set to a rune that validDelim rejects, or when the two
// are equal.
var errInvalidDelim = errors.New("csv: invalid field or comment delimiter")
    94	
    95	func validDelim(r rune) bool {
    96		return r != 0 && r != '"' && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError
    97	}
    98	
// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
//
// The Reader converts all \r\n sequences in its input to plain \n,
// including in multiline field values, so that the returned data does
// not depend on which line-ending convention an input file uses.
type Reader struct {
	// Comma is the field delimiter.
	// It is set to comma (',') by NewReader.
	// Comma must be a valid rune and must not be \r, \n,
	// or the Unicode replacement character (0xFFFD).
	Comma rune

	// Comment, if not 0, is the comment character. Lines beginning with the
	// Comment character without preceding whitespace are ignored.
	// With leading whitespace the Comment character becomes part of the
	// field, even if TrimLeadingSpace is true.
	// Comment must be a valid rune and must not be \r, \n,
	// or the Unicode replacement character (0xFFFD).
	// It must also not be equal to Comma.
	Comment rune

	// FieldsPerRecord is the number of expected fields per record.
	// If FieldsPerRecord is positive, Read requires each record to
	// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
	// the number of fields in the first record, so that future records must
	// have the same field count. If FieldsPerRecord is negative, no check is
	// made and records may have a variable number of fields.
	FieldsPerRecord int

	// If LazyQuotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	LazyQuotes bool

	// If TrimLeadingSpace is true, leading white space in a field is ignored.
	// This is done even if the field delimiter, Comma, is white space.
	TrimLeadingSpace bool

	// ReuseRecord controls whether calls to Read may return a slice sharing
	// the backing array of the previous call's returned slice for performance.
	// By default, each call to Read returns newly allocated memory owned by the caller.
	ReuseRecord bool

	TrailingComma bool // Deprecated: No longer used.

	// r is the buffered source all input is read from.
	r *bufio.Reader

	// numLine is the current line being read in the CSV file.
	numLine int

	// rawBuffer is a line buffer only used by the readLine method.
	rawBuffer []byte

	// recordBuffer holds the unescaped fields, one after another.
	// The fields can be accessed by using the indexes in fieldIndexes.
	// E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
	// and fieldIndexes will contain the indexes [1, 2, 5, 6].
	recordBuffer []byte

	// fieldIndexes is an index of fields inside recordBuffer.
	// The i'th field ends at offset fieldIndexes[i] in recordBuffer.
	fieldIndexes []int

	// lastRecord is a record cache and only used when ReuseRecord == true.
	lastRecord []string
}
   168	
   169	// NewReader returns a new Reader that reads from r.
   170	func NewReader(r io.Reader) *Reader {
   171		return &Reader{
   172			Comma: ',',
   173			r:     bufio.NewReader(r),
   174		}
   175	}
   176	
   177	// Read reads one record (a slice of fields) from r.
   178	// If the record has an unexpected number of fields,
   179	// Read returns the record along with the error ErrFieldCount.
   180	// Except for that case, Read always returns either a non-nil
   181	// record or a non-nil error, but not both.
   182	// If there is no data left to be read, Read returns nil, io.EOF.
   183	// If ReuseRecord is true, the returned slice may be shared
   184	// between multiple calls to Read.
   185	func (r *Reader) Read() (record []string, err error) {
   186		if r.ReuseRecord {
   187			record, err = r.readRecord(r.lastRecord)
   188			r.lastRecord = record
   189		} else {
   190			record, err = r.readRecord(nil)
   191		}
   192		return record, err
   193	}
   194	
   195	// ReadAll reads all the remaining records from r.
   196	// Each record is a slice of fields.
   197	// A successful call returns err == nil, not err == io.EOF. Because ReadAll is
   198	// defined to read until EOF, it does not treat end of file as an error to be
   199	// reported.
   200	func (r *Reader) ReadAll() (records [][]string, err error) {
   201		for {
   202			record, err := r.readRecord(nil)
   203			if err == io.EOF {
   204				return records, nil
   205			}
   206			if err != nil {
   207				return nil, err
   208			}
   209			records = append(records, record)
   210		}
   211	}
   212	
// readLine reads the next line (with the trailing endline).
// If EOF is hit without a trailing endline, it will be omitted.
// If some bytes were read, then the error is never io.EOF.
// The result is only valid until the next call to readLine.
func (r *Reader) readLine() ([]byte, error) {
	line, err := r.r.ReadSlice('\n')
	if err == bufio.ErrBufferFull {
		// The line is longer than bufio's internal buffer: accumulate
		// the partial reads into rawBuffer until ReadSlice reaches the
		// '\n' (or a real error).
		r.rawBuffer = append(r.rawBuffer[:0], line...)
		for err == bufio.ErrBufferFull {
			line, err = r.r.ReadSlice('\n')
			r.rawBuffer = append(r.rawBuffer, line...)
		}
		line = r.rawBuffer
	}
	if len(line) > 0 && err == io.EOF {
		// Data before EOF counts as a line; suppress io.EOF so the
		// caller processes it (EOF surfaces on the next call).
		err = nil
		// For backwards compatibility, drop trailing \r before EOF.
		if line[len(line)-1] == '\r' {
			line = line[:len(line)-1]
		}
	}
	r.numLine++
	// Normalize \r\n to \n on all input lines.
	if n := len(line); n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' {
		line[n-2] = '\n'
		line = line[:n-1]
	}
	return line, err
}
   242	
   243	// lengthNL reports the number of bytes for the trailing \n.
   244	func lengthNL(b []byte) int {
   245		if len(b) > 0 && b[len(b)-1] == '\n' {
   246			return 1
   247		}
   248		return 0
   249	}
   250	
   251	// nextRune returns the next rune in b or utf8.RuneError.
   252	func nextRune(b []byte) rune {
   253		r, _ := utf8.DecodeRune(b)
   254		return r
   255	}
   256	
// readRecord reads and parses one CSV record, appending the unescaped
// fields into recordBuffer/fieldIndexes and then slicing them out of a
// single string. dst, if non-nil and large enough, is reused for the
// returned slice (this backs ReuseRecord). On wrong field count it
// returns both the record and a ParseError wrapping ErrFieldCount.
func (r *Reader) readRecord(dst []string) ([]string, error) {
	// Validate delimiter configuration up front on every call, since
	// the exported fields may be changed between calls.
	if r.Comma == r.Comment || !validDelim(r.Comma) || (r.Comment != 0 && !validDelim(r.Comment)) {
		return nil, errInvalidDelim
	}

	// Read line (automatically skipping past empty lines and any comments).
	// fullLine retains the most recently read raw line so that error
	// columns can be computed as rune offsets into it.
	var line, fullLine []byte
	var errRead error
	for errRead == nil {
		line, errRead = r.readLine()
		if r.Comment != 0 && nextRune(line) == r.Comment {
			line = nil
			continue // Skip comment lines
		}
		if errRead == nil && len(line) == lengthNL(line) {
			line = nil
			continue // Skip empty lines
		}
		fullLine = line
		break
	}
	if errRead == io.EOF {
		return nil, errRead
	}

	// Parse each field in the record. Each parseField iteration consumes
	// one field from line; a quoted field may pull in additional lines.
	var err error
	const quoteLen = len(`"`)
	commaLen := utf8.RuneLen(r.Comma)
	recLine := r.numLine // Starting line for record
	r.recordBuffer = r.recordBuffer[:0]
	r.fieldIndexes = r.fieldIndexes[:0]
parseField:
	for {
		if r.TrimLeadingSpace {
			line = bytes.TrimLeftFunc(line, unicode.IsSpace)
		}
		if len(line) == 0 || line[0] != '"' {
			// Non-quoted string field
			i := bytes.IndexRune(line, r.Comma)
			field := line
			if i >= 0 {
				field = field[:i]
			} else {
				// Last field on the line: strip the trailing newline.
				field = field[:len(field)-lengthNL(field)]
			}
			// Check to make sure a quote does not appear in field.
			if !r.LazyQuotes {
				if j := bytes.IndexByte(field, '"'); j >= 0 {
					col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
					err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
					break parseField
				}
			}
			r.recordBuffer = append(r.recordBuffer, field...)
			r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
			if i >= 0 {
				line = line[i+commaLen:]
				continue parseField
			}
			break parseField
		} else {
			// Quoted string field
			line = line[quoteLen:]
			for {
				i := bytes.IndexByte(line, '"')
				if i >= 0 {
					// Hit next quote.
					r.recordBuffer = append(r.recordBuffer, line[:i]...)
					line = line[i+quoteLen:]
					// The rune after the closing quote decides what the
					// quote meant.
					switch rn := nextRune(line); {
					case rn == '"':
						// `""` sequence (append quote).
						r.recordBuffer = append(r.recordBuffer, '"')
						line = line[quoteLen:]
					case rn == r.Comma:
						// `",` sequence (end of field).
						line = line[commaLen:]
						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
						continue parseField
					case lengthNL(line) == len(line):
						// `"\n` sequence (end of line).
						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
						break parseField
					case r.LazyQuotes:
						// `"` sequence (bare quote).
						r.recordBuffer = append(r.recordBuffer, '"')
					default:
						// `"*` sequence (invalid non-escaped quote).
						col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
						break parseField
					}
				} else if len(line) > 0 {
					// Hit end of line (copy all data so far).
					// The quoted field continues onto the next line.
					r.recordBuffer = append(r.recordBuffer, line...)
					if errRead != nil {
						break parseField
					}
					line, errRead = r.readLine()
					if errRead == io.EOF {
						errRead = nil
					}
					fullLine = line
				} else {
					// Abrupt end of file (EOF or error).
					if !r.LazyQuotes && errRead == nil {
						// EOF inside a quoted field: the closing quote
						// is missing.
						col := utf8.RuneCount(fullLine)
						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
						break parseField
					}
					r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
					break parseField
				}
			}
		}
	}
	if err == nil {
		err = errRead
	}

	// Create a single string and create slices out of it.
	// This pins the memory of the fields together, but allocates once.
	str := string(r.recordBuffer) // Convert to string once to batch allocations
	dst = dst[:0]
	if cap(dst) < len(r.fieldIndexes) {
		dst = make([]string, len(r.fieldIndexes))
	}
	dst = dst[:len(r.fieldIndexes)]
	var preIdx int
	for i, idx := range r.fieldIndexes {
		dst[i] = str[preIdx:idx]
		preIdx = idx
	}

	// Check or update the expected fields per record.
	if r.FieldsPerRecord > 0 {
		if len(dst) != r.FieldsPerRecord && err == nil {
			err = &ParseError{StartLine: recLine, Line: recLine, Err: ErrFieldCount}
		}
	} else if r.FieldsPerRecord == 0 {
		r.FieldsPerRecord = len(dst)
	}
	return dst, err
}
   402	

View as plain text