Source file src/pkg/archive/zip/reader.go

     1	// Copyright 2010 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package zip
     6	
     7	import (
     8		"bufio"
     9		"encoding/binary"
    10		"errors"
    11		"fmt"
    12		"hash"
    13		"hash/crc32"
    14		"io"
    15		"os"
    16		"time"
    17	)
    18	
    19	var (
    20		ErrFormat    = errors.New("zip: not a valid zip file")
    21		ErrAlgorithm = errors.New("zip: unsupported compression algorithm")
    22		ErrChecksum  = errors.New("zip: checksum error")
    23	)
    24	
    25	type Reader struct {
    26		r             io.ReaderAt
    27		File          []*File
    28		Comment       string
    29		decompressors map[uint16]Decompressor
    30	}
    31	
    32	type ReadCloser struct {
    33		f *os.File
    34		Reader
    35	}
    36	
    37	type File struct {
    38		FileHeader
    39		zip          *Reader
    40		zipr         io.ReaderAt
    41		zipsize      int64
    42		headerOffset int64
    43	}
    44	
    45	func (f *File) hasDataDescriptor() bool {
    46		return f.Flags&0x8 != 0
    47	}
    48	
    49	// OpenReader will open the Zip file specified by name and return a ReadCloser.
    50	func OpenReader(name string) (*ReadCloser, error) {
    51		f, err := os.Open(name)
    52		if err != nil {
    53			return nil, err
    54		}
    55		fi, err := f.Stat()
    56		if err != nil {
    57			f.Close()
    58			return nil, err
    59		}
    60		r := new(ReadCloser)
    61		if err := r.init(f, fi.Size()); err != nil {
    62			f.Close()
    63			return nil, err
    64		}
    65		r.f = f
    66		return r, nil
    67	}
    68	
    69	// NewReader returns a new Reader reading from r, which is assumed to
    70	// have the given size in bytes.
    71	func NewReader(r io.ReaderAt, size int64) (*Reader, error) {
    72		if size < 0 {
    73			return nil, errors.New("zip: size cannot be negative")
    74		}
    75		zr := new(Reader)
    76		if err := zr.init(r, size); err != nil {
    77			return nil, err
    78		}
    79		return zr, nil
    80	}
    81	
    82	func (z *Reader) init(r io.ReaderAt, size int64) error {
    83		end, err := readDirectoryEnd(r, size)
    84		if err != nil {
    85			return err
    86		}
    87		if end.directoryRecords > uint64(size)/fileHeaderLen {
    88			return fmt.Errorf("archive/zip: TOC declares impossible %d files in %d byte zip", end.directoryRecords, size)
    89		}
    90		z.r = r
    91		z.File = make([]*File, 0, end.directoryRecords)
    92		z.Comment = end.comment
    93		rs := io.NewSectionReader(r, 0, size)
    94		if _, err = rs.Seek(int64(end.directoryOffset), io.SeekStart); err != nil {
    95			return err
    96		}
    97		buf := bufio.NewReader(rs)
    98	
    99		// The count of files inside a zip is truncated to fit in a uint16.
   100		// Gloss over this by reading headers until we encounter
   101		// a bad one, and then only report an ErrFormat or UnexpectedEOF if
   102		// the file count modulo 65536 is incorrect.
   103		for {
   104			f := &File{zip: z, zipr: r, zipsize: size}
   105			err = readDirectoryHeader(f, buf)
   106			if err == ErrFormat || err == io.ErrUnexpectedEOF {
   107				break
   108			}
   109			if err != nil {
   110				return err
   111			}
   112			z.File = append(z.File, f)
   113		}
   114		if uint16(len(z.File)) != uint16(end.directoryRecords) { // only compare 16 bits here
   115			// Return the readDirectoryHeader error if we read
   116			// the wrong number of directory entries.
   117			return err
   118		}
   119		return nil
   120	}
   121	
   122	// RegisterDecompressor registers or overrides a custom decompressor for a
   123	// specific method ID. If a decompressor for a given method is not found,
   124	// Reader will default to looking up the decompressor at the package level.
   125	func (z *Reader) RegisterDecompressor(method uint16, dcomp Decompressor) {
   126		if z.decompressors == nil {
   127			z.decompressors = make(map[uint16]Decompressor)
   128		}
   129		z.decompressors[method] = dcomp
   130	}
   131	
   132	func (z *Reader) decompressor(method uint16) Decompressor {
   133		dcomp := z.decompressors[method]
   134		if dcomp == nil {
   135			dcomp = decompressor(method)
   136		}
   137		return dcomp
   138	}
   139	
   140	// Close closes the Zip file, rendering it unusable for I/O.
   141	func (rc *ReadCloser) Close() error {
   142		return rc.f.Close()
   143	}
   144	
   145	// DataOffset returns the offset of the file's possibly-compressed
   146	// data, relative to the beginning of the zip file.
   147	//
   148	// Most callers should instead use Open, which transparently
   149	// decompresses data and verifies checksums.
   150	func (f *File) DataOffset() (offset int64, err error) {
   151		bodyOffset, err := f.findBodyOffset()
   152		if err != nil {
   153			return
   154		}
   155		return f.headerOffset + bodyOffset, nil
   156	}
   157	
   158	// Open returns a ReadCloser that provides access to the File's contents.
   159	// Multiple files may be read concurrently.
   160	func (f *File) Open() (io.ReadCloser, error) {
   161		bodyOffset, err := f.findBodyOffset()
   162		if err != nil {
   163			return nil, err
   164		}
   165		size := int64(f.CompressedSize64)
   166		r := io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset, size)
   167		dcomp := f.zip.decompressor(f.Method)
   168		if dcomp == nil {
   169			return nil, ErrAlgorithm
   170		}
   171		var rc io.ReadCloser = dcomp(r)
   172		var desr io.Reader
   173		if f.hasDataDescriptor() {
   174			desr = io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset+size, dataDescriptorLen)
   175		}
   176		rc = &checksumReader{
   177			rc:   rc,
   178			hash: crc32.NewIEEE(),
   179			f:    f,
   180			desr: desr,
   181		}
   182		return rc, nil
   183	}
   184	
   185	type checksumReader struct {
   186		rc    io.ReadCloser
   187		hash  hash.Hash32
   188		nread uint64 // number of bytes read so far
   189		f     *File
   190		desr  io.Reader // if non-nil, where to read the data descriptor
   191		err   error     // sticky error
   192	}
   193	
   194	func (r *checksumReader) Read(b []byte) (n int, err error) {
   195		if r.err != nil {
   196			return 0, r.err
   197		}
   198		n, err = r.rc.Read(b)
   199		r.hash.Write(b[:n])
   200		r.nread += uint64(n)
   201		if err == nil {
   202			return
   203		}
   204		if err == io.EOF {
   205			if r.nread != r.f.UncompressedSize64 {
   206				return 0, io.ErrUnexpectedEOF
   207			}
   208			if r.desr != nil {
   209				if err1 := readDataDescriptor(r.desr, r.f); err1 != nil {
   210					if err1 == io.EOF {
   211						err = io.ErrUnexpectedEOF
   212					} else {
   213						err = err1
   214					}
   215				} else if r.hash.Sum32() != r.f.CRC32 {
   216					err = ErrChecksum
   217				}
   218			} else {
   219				// If there's not a data descriptor, we still compare
   220				// the CRC32 of what we've read against the file header
   221				// or TOC's CRC32, if it seems like it was set.
   222				if r.f.CRC32 != 0 && r.hash.Sum32() != r.f.CRC32 {
   223					err = ErrChecksum
   224				}
   225			}
   226		}
   227		r.err = err
   228		return
   229	}
   230	
   231	func (r *checksumReader) Close() error { return r.rc.Close() }
   232	
   233	// findBodyOffset does the minimum work to verify the file has a header
   234	// and returns the file body offset.
   235	func (f *File) findBodyOffset() (int64, error) {
   236		var buf [fileHeaderLen]byte
   237		if _, err := f.zipr.ReadAt(buf[:], f.headerOffset); err != nil {
   238			return 0, err
   239		}
   240		b := readBuf(buf[:])
   241		if sig := b.uint32(); sig != fileHeaderSignature {
   242			return 0, ErrFormat
   243		}
   244		b = b[22:] // skip over most of the header
   245		filenameLen := int(b.uint16())
   246		extraLen := int(b.uint16())
   247		return int64(fileHeaderLen + filenameLen + extraLen), nil
   248	}
   249	
   250	// readDirectoryHeader attempts to read a directory header from r.
   251	// It returns io.ErrUnexpectedEOF if it cannot read a complete header,
   252	// and ErrFormat if it doesn't find a valid header signature.
   253	func readDirectoryHeader(f *File, r io.Reader) error {
   254		var buf [directoryHeaderLen]byte
   255		if _, err := io.ReadFull(r, buf[:]); err != nil {
   256			return err
   257		}
   258		b := readBuf(buf[:])
   259		if sig := b.uint32(); sig != directoryHeaderSignature {
   260			return ErrFormat
   261		}
   262		f.CreatorVersion = b.uint16()
   263		f.ReaderVersion = b.uint16()
   264		f.Flags = b.uint16()
   265		f.Method = b.uint16()
   266		f.ModifiedTime = b.uint16()
   267		f.ModifiedDate = b.uint16()
   268		f.CRC32 = b.uint32()
   269		f.CompressedSize = b.uint32()
   270		f.UncompressedSize = b.uint32()
   271		f.CompressedSize64 = uint64(f.CompressedSize)
   272		f.UncompressedSize64 = uint64(f.UncompressedSize)
   273		filenameLen := int(b.uint16())
   274		extraLen := int(b.uint16())
   275		commentLen := int(b.uint16())
   276		b = b[4:] // skipped start disk number and internal attributes (2x uint16)
   277		f.ExternalAttrs = b.uint32()
   278		f.headerOffset = int64(b.uint32())
   279		d := make([]byte, filenameLen+extraLen+commentLen)
   280		if _, err := io.ReadFull(r, d); err != nil {
   281			return err
   282		}
   283		f.Name = string(d[:filenameLen])
   284		f.Extra = d[filenameLen : filenameLen+extraLen]
   285		f.Comment = string(d[filenameLen+extraLen:])
   286	
   287		// Determine the character encoding.
   288		utf8Valid1, utf8Require1 := detectUTF8(f.Name)
   289		utf8Valid2, utf8Require2 := detectUTF8(f.Comment)
   290		switch {
   291		case !utf8Valid1 || !utf8Valid2:
   292			// Name and Comment definitely not UTF-8.
   293			f.NonUTF8 = true
   294		case !utf8Require1 && !utf8Require2:
   295			// Name and Comment use only single-byte runes that overlap with UTF-8.
   296			f.NonUTF8 = false
   297		default:
   298			// Might be UTF-8, might be some other encoding; preserve existing flag.
   299			// Some ZIP writers use UTF-8 encoding without setting the UTF-8 flag.
   300			// Since it is impossible to always distinguish valid UTF-8 from some
   301			// other encoding (e.g., GBK or Shift-JIS), we trust the flag.
   302			f.NonUTF8 = f.Flags&0x800 == 0
   303		}
   304	
   305		needUSize := f.UncompressedSize == ^uint32(0)
   306		needCSize := f.CompressedSize == ^uint32(0)
   307		needHeaderOffset := f.headerOffset == int64(^uint32(0))
   308	
   309		// Best effort to find what we need.
   310		// Other zip authors might not even follow the basic format,
   311		// and we'll just ignore the Extra content in that case.
   312		var modified time.Time
   313	parseExtras:
   314		for extra := readBuf(f.Extra); len(extra) >= 4; { // need at least tag and size
   315			fieldTag := extra.uint16()
   316			fieldSize := int(extra.uint16())
   317			if len(extra) < fieldSize {
   318				break
   319			}
   320			fieldBuf := extra.sub(fieldSize)
   321	
   322			switch fieldTag {
   323			case zip64ExtraID:
   324				// update directory values from the zip64 extra block.
   325				// They should only be consulted if the sizes read earlier
   326				// are maxed out.
   327				// See golang.org/issue/13367.
   328				if needUSize {
   329					needUSize = false
   330					if len(fieldBuf) < 8 {
   331						return ErrFormat
   332					}
   333					f.UncompressedSize64 = fieldBuf.uint64()
   334				}
   335				if needCSize {
   336					needCSize = false
   337					if len(fieldBuf) < 8 {
   338						return ErrFormat
   339					}
   340					f.CompressedSize64 = fieldBuf.uint64()
   341				}
   342				if needHeaderOffset {
   343					needHeaderOffset = false
   344					if len(fieldBuf) < 8 {
   345						return ErrFormat
   346					}
   347					f.headerOffset = int64(fieldBuf.uint64())
   348				}
   349			case ntfsExtraID:
   350				if len(fieldBuf) < 4 {
   351					continue parseExtras
   352				}
   353				fieldBuf.uint32()        // reserved (ignored)
   354				for len(fieldBuf) >= 4 { // need at least tag and size
   355					attrTag := fieldBuf.uint16()
   356					attrSize := int(fieldBuf.uint16())
   357					if len(fieldBuf) < attrSize {
   358						continue parseExtras
   359					}
   360					attrBuf := fieldBuf.sub(attrSize)
   361					if attrTag != 1 || attrSize != 24 {
   362						continue // Ignore irrelevant attributes
   363					}
   364	
   365					const ticksPerSecond = 1e7    // Windows timestamp resolution
   366					ts := int64(attrBuf.uint64()) // ModTime since Windows epoch
   367					secs := int64(ts / ticksPerSecond)
   368					nsecs := (1e9 / ticksPerSecond) * int64(ts%ticksPerSecond)
   369					epoch := time.Date(1601, time.January, 1, 0, 0, 0, 0, time.UTC)
   370					modified = time.Unix(epoch.Unix()+secs, nsecs)
   371				}
   372			case unixExtraID, infoZipUnixExtraID:
   373				if len(fieldBuf) < 8 {
   374					continue parseExtras
   375				}
   376				fieldBuf.uint32()              // AcTime (ignored)
   377				ts := int64(fieldBuf.uint32()) // ModTime since Unix epoch
   378				modified = time.Unix(ts, 0)
   379			case extTimeExtraID:
   380				if len(fieldBuf) < 5 || fieldBuf.uint8()&1 == 0 {
   381					continue parseExtras
   382				}
   383				ts := int64(fieldBuf.uint32()) // ModTime since Unix epoch
   384				modified = time.Unix(ts, 0)
   385			}
   386		}
   387	
   388		msdosModified := msDosTimeToTime(f.ModifiedDate, f.ModifiedTime)
   389		f.Modified = msdosModified
   390		if !modified.IsZero() {
   391			f.Modified = modified.UTC()
   392	
   393			// If legacy MS-DOS timestamps are set, we can use the delta between
   394			// the legacy and extended versions to estimate timezone offset.
   395			//
   396			// A non-UTC timezone is always used (even if offset is zero).
   397			// Thus, FileHeader.Modified.Location() == time.UTC is useful for
   398			// determining whether extended timestamps are present.
   399			// This is necessary for users that need to do additional time
   400			// calculations when dealing with legacy ZIP formats.
   401			if f.ModifiedTime != 0 || f.ModifiedDate != 0 {
   402				f.Modified = modified.In(timeZone(msdosModified.Sub(modified)))
   403			}
   404		}
   405	
   406		// Assume that uncompressed size 2³²-1 could plausibly happen in
   407		// an old zip32 file that was sharding inputs into the largest chunks
   408		// possible (or is just malicious; search the web for 42.zip).
   409		// If needUSize is true still, it means we didn't see a zip64 extension.
   410		// As long as the compressed size is not also 2³²-1 (implausible)
   411		// and the header is not also 2³²-1 (equally implausible),
   412		// accept the uncompressed size 2³²-1 as valid.
   413		// If nothing else, this keeps archive/zip working with 42.zip.
   414		_ = needUSize
   415	
   416		if needCSize || needHeaderOffset {
   417			return ErrFormat
   418		}
   419	
   420		return nil
   421	}
   422	
   423	func readDataDescriptor(r io.Reader, f *File) error {
   424		var buf [dataDescriptorLen]byte
   425	
   426		// The spec says: "Although not originally assigned a
   427		// signature, the value 0x08074b50 has commonly been adopted
   428		// as a signature value for the data descriptor record.
   429		// Implementers should be aware that ZIP files may be
   430		// encountered with or without this signature marking data
   431		// descriptors and should account for either case when reading
   432		// ZIP files to ensure compatibility."
   433		//
   434		// dataDescriptorLen includes the size of the signature but
   435		// first read just those 4 bytes to see if it exists.
   436		if _, err := io.ReadFull(r, buf[:4]); err != nil {
   437			return err
   438		}
   439		off := 0
   440		maybeSig := readBuf(buf[:4])
   441		if maybeSig.uint32() != dataDescriptorSignature {
   442			// No data descriptor signature. Keep these four
   443			// bytes.
   444			off += 4
   445		}
   446		if _, err := io.ReadFull(r, buf[off:12]); err != nil {
   447			return err
   448		}
   449		b := readBuf(buf[:12])
   450		if b.uint32() != f.CRC32 {
   451			return ErrChecksum
   452		}
   453	
   454		// The two sizes that follow here can be either 32 bits or 64 bits
   455		// but the spec is not very clear on this and different
   456		// interpretations has been made causing incompatibilities. We
   457		// already have the sizes from the central directory so we can
   458		// just ignore these.
   459	
   460		return nil
   461	}
   462	
   463	func readDirectoryEnd(r io.ReaderAt, size int64) (dir *directoryEnd, err error) {
   464		// look for directoryEndSignature in the last 1k, then in the last 65k
   465		var buf []byte
   466		var directoryEndOffset int64
   467		for i, bLen := range []int64{1024, 65 * 1024} {
   468			if bLen > size {
   469				bLen = size
   470			}
   471			buf = make([]byte, int(bLen))
   472			if _, err := r.ReadAt(buf, size-bLen); err != nil && err != io.EOF {
   473				return nil, err
   474			}
   475			if p := findSignatureInBlock(buf); p >= 0 {
   476				buf = buf[p:]
   477				directoryEndOffset = size - bLen + int64(p)
   478				break
   479			}
   480			if i == 1 || bLen == size {
   481				return nil, ErrFormat
   482			}
   483		}
   484	
   485		// read header into struct
   486		b := readBuf(buf[4:]) // skip signature
   487		d := &directoryEnd{
   488			diskNbr:            uint32(b.uint16()),
   489			dirDiskNbr:         uint32(b.uint16()),
   490			dirRecordsThisDisk: uint64(b.uint16()),
   491			directoryRecords:   uint64(b.uint16()),
   492			directorySize:      uint64(b.uint32()),
   493			directoryOffset:    uint64(b.uint32()),
   494			commentLen:         b.uint16(),
   495		}
   496		l := int(d.commentLen)
   497		if l > len(b) {
   498			return nil, errors.New("zip: invalid comment length")
   499		}
   500		d.comment = string(b[:l])
   501	
   502		// These values mean that the file can be a zip64 file
   503		if d.directoryRecords == 0xffff || d.directorySize == 0xffff || d.directoryOffset == 0xffffffff {
   504			p, err := findDirectory64End(r, directoryEndOffset)
   505			if err == nil && p >= 0 {
   506				err = readDirectory64End(r, p, d)
   507			}
   508			if err != nil {
   509				return nil, err
   510			}
   511		}
   512		// Make sure directoryOffset points to somewhere in our file.
   513		if o := int64(d.directoryOffset); o < 0 || o >= size {
   514			return nil, ErrFormat
   515		}
   516		return d, nil
   517	}
   518	
   519	// findDirectory64End tries to read the zip64 locator just before the
   520	// directory end and returns the offset of the zip64 directory end if
   521	// found.
   522	func findDirectory64End(r io.ReaderAt, directoryEndOffset int64) (int64, error) {
   523		locOffset := directoryEndOffset - directory64LocLen
   524		if locOffset < 0 {
   525			return -1, nil // no need to look for a header outside the file
   526		}
   527		buf := make([]byte, directory64LocLen)
   528		if _, err := r.ReadAt(buf, locOffset); err != nil {
   529			return -1, err
   530		}
   531		b := readBuf(buf)
   532		if sig := b.uint32(); sig != directory64LocSignature {
   533			return -1, nil
   534		}
   535		if b.uint32() != 0 { // number of the disk with the start of the zip64 end of central directory
   536			return -1, nil // the file is not a valid zip64-file
   537		}
   538		p := b.uint64()      // relative offset of the zip64 end of central directory record
   539		if b.uint32() != 1 { // total number of disks
   540			return -1, nil // the file is not a valid zip64-file
   541		}
   542		return int64(p), nil
   543	}
   544	
   545	// readDirectory64End reads the zip64 directory end and updates the
   546	// directory end with the zip64 directory end values.
   547	func readDirectory64End(r io.ReaderAt, offset int64, d *directoryEnd) (err error) {
   548		buf := make([]byte, directory64EndLen)
   549		if _, err := r.ReadAt(buf, offset); err != nil {
   550			return err
   551		}
   552	
   553		b := readBuf(buf)
   554		if sig := b.uint32(); sig != directory64EndSignature {
   555			return ErrFormat
   556		}
   557	
   558		b = b[12:]                        // skip dir size, version and version needed (uint64 + 2x uint16)
   559		d.diskNbr = b.uint32()            // number of this disk
   560		d.dirDiskNbr = b.uint32()         // number of the disk with the start of the central directory
   561		d.dirRecordsThisDisk = b.uint64() // total number of entries in the central directory on this disk
   562		d.directoryRecords = b.uint64()   // total number of entries in the central directory
   563		d.directorySize = b.uint64()      // size of the central directory
   564		d.directoryOffset = b.uint64()    // offset of start of central directory with respect to the starting disk number
   565	
   566		return nil
   567	}
   568	
   569	func findSignatureInBlock(b []byte) int {
   570		for i := len(b) - directoryEndLen; i >= 0; i-- {
   571			// defined from directoryEndSignature in struct.go
   572			if b[i] == 'P' && b[i+1] == 'K' && b[i+2] == 0x05 && b[i+3] == 0x06 {
   573				// n is length of comment
   574				n := int(b[i+directoryEndLen-2]) | int(b[i+directoryEndLen-1])<<8
   575				if n+directoryEndLen+i <= len(b) {
   576					return i
   577				}
   578			}
   579		}
   580		return -1
   581	}
   582	
   583	type readBuf []byte
   584	
   585	func (b *readBuf) uint8() uint8 {
   586		v := (*b)[0]
   587		*b = (*b)[1:]
   588		return v
   589	}
   590	
   591	func (b *readBuf) uint16() uint16 {
   592		v := binary.LittleEndian.Uint16(*b)
   593		*b = (*b)[2:]
   594		return v
   595	}
   596	
   597	func (b *readBuf) uint32() uint32 {
   598		v := binary.LittleEndian.Uint32(*b)
   599		*b = (*b)[4:]
   600		return v
   601	}
   602	
   603	func (b *readBuf) uint64() uint64 {
   604		v := binary.LittleEndian.Uint64(*b)
   605		*b = (*b)[8:]
   606		return v
   607	}
   608	
   609	func (b *readBuf) sub(n int) readBuf {
   610		b2 := (*b)[:n]
   611		*b = (*b)[n:]
   612		return b2
   613	}
   614
View as plain text