Source file src/compress/lzw/reader.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package lzw implements the Lempel-Ziv-Welch compressed data format,
     6	// described in T. A. Welch, ``A Technique for High-Performance Data
     7	// Compression'', Computer, 17(6) (June 1984), pp 8-19.
     8	//
     9	// In particular, it implements LZW as used by the GIF and PDF file
    10	// formats, which means variable-width codes up to 12 bits and the first
    11	// two non-literal codes are a clear code and an EOF code.
    12	//
    13	// The TIFF file format uses a similar but incompatible version of the LZW
    14	// algorithm. See the golang.org/x/image/tiff/lzw package for an
    15	// implementation.
    16	package lzw
    17	
    18	// TODO(nigeltao): check that PDF uses LZW in the same way as GIF,
    19	// modulo LSB/MSB packing order.
    20	
    21	import (
    22		"bufio"
    23		"errors"
    24		"fmt"
    25		"io"
    26	)
    27	
    28	// Order specifies the bit ordering in an LZW data stream.
    29	type Order int
    30	
    31	const (
    32		// LSB means Least Significant Bits first, as used in the GIF file format.
    33		LSB Order = iota
    34		// MSB means Most Significant Bits first, as used in the TIFF and PDF
    35		// file formats.
    36		MSB
    37	)
    38	
    39	const (
    40		maxWidth           = 12
    41		decoderInvalidCode = 0xffff
    42		flushBuffer        = 1 << maxWidth
    43	)
    44	
    45	// decoder is the state from which the readXxx method converts a byte
    46	// stream into a code stream.
    47	type decoder struct {
    48		r        io.ByteReader
    49		bits     uint32
    50		nBits    uint
    51		width    uint
    52		read     func(*decoder) (uint16, error) // readLSB or readMSB
    53		litWidth int                            // width in bits of literal codes
    54		err      error
    55	
    56		// The first 1<<litWidth codes are literal codes.
    57		// The next two codes mean clear and EOF.
    58		// Other valid codes are in the range [lo, hi] where lo := clear + 2,
    59		// with the upper bound incrementing on each code seen.
    60		//
    61		// overflow is the code at which hi overflows the code width. It always
    62		// equals 1 << width.
    63		//
    64		// last is the most recently seen code, or decoderInvalidCode.
    65		//
    66		// An invariant is that
    67		// (hi < overflow) || (hi == overflow && last == decoderInvalidCode)
    68		clear, eof, hi, overflow, last uint16
    69	
    70		// Each code c in [lo, hi] expands to two or more bytes. For c != hi:
    71		//   suffix[c] is the last of these bytes.
    72		//   prefix[c] is the code for all but the last byte.
    73		//   This code can either be a literal code or another code in [lo, c).
    74		// The c == hi case is a special case.
    75		suffix [1 << maxWidth]uint8
    76		prefix [1 << maxWidth]uint16
    77	
    78		// output is the temporary output buffer.
    79		// Literal codes are accumulated from the start of the buffer.
    80		// Non-literal codes decode to a sequence of suffixes that are first
    81		// written right-to-left from the end of the buffer before being copied
    82		// to the start of the buffer.
    83		// It is flushed when it contains >= 1<<maxWidth bytes,
    84		// so that there is always room to decode an entire code.
    85		output [2 * 1 << maxWidth]byte
    86		o      int    // write index into output
    87		toRead []byte // bytes to return from Read
    88	}
    89	
    90	// readLSB returns the next code for "Least Significant Bits first" data.
    91	func (d *decoder) readLSB() (uint16, error) {
    92		for d.nBits < d.width {
    93			x, err := d.r.ReadByte()
    94			if err != nil {
    95				return 0, err
    96			}
    97			d.bits |= uint32(x) << d.nBits
    98			d.nBits += 8
    99		}
   100		code := uint16(d.bits & (1<<d.width - 1))
   101		d.bits >>= d.width
   102		d.nBits -= d.width
   103		return code, nil
   104	}
   105	
   106	// readMSB returns the next code for "Most Significant Bits first" data.
   107	func (d *decoder) readMSB() (uint16, error) {
   108		for d.nBits < d.width {
   109			x, err := d.r.ReadByte()
   110			if err != nil {
   111				return 0, err
   112			}
   113			d.bits |= uint32(x) << (24 - d.nBits)
   114			d.nBits += 8
   115		}
   116		code := uint16(d.bits >> (32 - d.width))
   117		d.bits <<= d.width
   118		d.nBits -= d.width
   119		return code, nil
   120	}
   121	
   122	func (d *decoder) Read(b []byte) (int, error) {
   123		for {
   124			if len(d.toRead) > 0 {
   125				n := copy(b, d.toRead)
   126				d.toRead = d.toRead[n:]
   127				return n, nil
   128			}
   129			if d.err != nil {
   130				return 0, d.err
   131			}
   132			d.decode()
   133		}
   134	}
   135	
   136	// decode decompresses bytes from r and leaves them in d.toRead.
   137	// read specifies how to decode bytes into codes.
   138	// litWidth is the width in bits of literal codes.
   139	func (d *decoder) decode() {
   140		// Loop over the code stream, converting codes into decompressed bytes.
   141	loop:
   142		for {
   143			code, err := d.read(d)
   144			if err != nil {
   145				if err == io.EOF {
   146					err = io.ErrUnexpectedEOF
   147				}
   148				d.err = err
   149				break
   150			}
   151			switch {
   152			case code < d.clear:
   153				// We have a literal code.
   154				d.output[d.o] = uint8(code)
   155				d.o++
   156				if d.last != decoderInvalidCode {
   157					// Save what the hi code expands to.
   158					d.suffix[d.hi] = uint8(code)
   159					d.prefix[d.hi] = d.last
   160				}
   161			case code == d.clear:
   162				d.width = 1 + uint(d.litWidth)
   163				d.hi = d.eof
   164				d.overflow = 1 << d.width
   165				d.last = decoderInvalidCode
   166				continue
   167			case code == d.eof:
   168				d.err = io.EOF
   169				break loop
   170			case code <= d.hi:
   171				c, i := code, len(d.output)-1
   172				if code == d.hi && d.last != decoderInvalidCode {
   173					// code == hi is a special case which expands to the last expansion
   174					// followed by the head of the last expansion. To find the head, we walk
   175					// the prefix chain until we find a literal code.
   176					c = d.last
   177					for c >= d.clear {
   178						c = d.prefix[c]
   179					}
   180					d.output[i] = uint8(c)
   181					i--
   182					c = d.last
   183				}
   184				// Copy the suffix chain into output and then write that to w.
   185				for c >= d.clear {
   186					d.output[i] = d.suffix[c]
   187					i--
   188					c = d.prefix[c]
   189				}
   190				d.output[i] = uint8(c)
   191				d.o += copy(d.output[d.o:], d.output[i:])
   192				if d.last != decoderInvalidCode {
   193					// Save what the hi code expands to.
   194					d.suffix[d.hi] = uint8(c)
   195					d.prefix[d.hi] = d.last
   196				}
   197			default:
   198				d.err = errors.New("lzw: invalid code")
   199				break loop
   200			}
   201			d.last, d.hi = code, d.hi+1
   202			if d.hi >= d.overflow {
   203				if d.width == maxWidth {
   204					d.last = decoderInvalidCode
   205					// Undo the d.hi++ a few lines above, so that (1) we maintain
   206					// the invariant that d.hi <= d.overflow, and (2) d.hi does not
   207					// eventually overflow a uint16.
   208					d.hi--
   209				} else {
   210					d.width++
   211					d.overflow <<= 1
   212				}
   213			}
   214			if d.o >= flushBuffer {
   215				break
   216			}
   217		}
   218		// Flush pending output.
   219		d.toRead = d.output[:d.o]
   220		d.o = 0
   221	}
   222	
   223	var errClosed = errors.New("lzw: reader/writer is closed")
   224	
   225	func (d *decoder) Close() error {
   226		d.err = errClosed // in case any Reads come along
   227		return nil
   228	}
   229	
   230	// NewReader creates a new io.ReadCloser.
   231	// Reads from the returned io.ReadCloser read and decompress data from r.
   232	// If r does not also implement io.ByteReader,
   233	// the decompressor may read more data than necessary from r.
   234	// It is the caller's responsibility to call Close on the ReadCloser when
   235	// finished reading.
   236	// The number of bits to use for literal codes, litWidth, must be in the
   237	// range [2,8] and is typically 8. It must equal the litWidth
   238	// used during compression.
   239	func NewReader(r io.Reader, order Order, litWidth int) io.ReadCloser {
   240		d := new(decoder)
   241		switch order {
   242		case LSB:
   243			d.read = (*decoder).readLSB
   244		case MSB:
   245			d.read = (*decoder).readMSB
   246		default:
   247			d.err = errors.New("lzw: unknown order")
   248			return d
   249		}
   250		if litWidth < 2 || 8 < litWidth {
   251			d.err = fmt.Errorf("lzw: litWidth %d out of range", litWidth)
   252			return d
   253		}
   254		if br, ok := r.(io.ByteReader); ok {
   255			d.r = br
   256		} else {
   257			d.r = bufio.NewReader(r)
   258		}
   259		d.litWidth = litWidth
   260		d.width = 1 + uint(litWidth)
   261		d.clear = uint16(1) << uint(litWidth)
   262		d.eof, d.hi = d.clear+1, d.clear+1
   263		d.overflow = uint16(1) << d.width
   264		d.last = decoderInvalidCode
   265	
   266		return d
   267	}
   268
View as plain text