Source file src/compress/bzip2/bzip2.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package bzip2 implements bzip2 decompression.
     6	package bzip2
     7	
     8	import "io"
     9	
    10	// There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
    11	// of guessing: https://en.wikipedia.org/wiki/Bzip2
    12	// The source code to pyflate was useful for debugging:
    13	// http://www.paul.sladen.org/projects/pyflate
    14	
    15	// A StructuralError is returned when the bzip2 data is found to be
    16	// syntactically invalid.
    17	type StructuralError string
    18	
    19	func (s StructuralError) Error() string {
    20		return "bzip2 data invalid: " + string(s)
    21	}
    22	
    23	// A reader decompresses bzip2 compressed data.
    24	type reader struct {
    25		br           bitReader
    26		fileCRC      uint32
    27		blockCRC     uint32
    28		wantBlockCRC uint32
    29		setupDone    bool // true if we have parsed the bzip2 header.
    30		blockSize    int  // blockSize in bytes, i.e. 900 * 1000.
    31		eof          bool
    32		c            [256]uint // the `C' array for the inverse BWT.
    33		tt           []uint32  // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
    34		tPos         uint32    // Index of the next output byte in tt.
    35	
    36		preRLE      []uint32 // contains the RLE data still to be processed.
    37		preRLEUsed  int      // number of entries of preRLE used.
    38		lastByte    int      // the last byte value seen.
    39		byteRepeats uint     // the number of repeats of lastByte seen.
    40		repeats     uint     // the number of copies of lastByte to output.
    41	}
    42	
    43	// NewReader returns an io.Reader which decompresses bzip2 data from r.
    44	// If r does not also implement io.ByteReader,
    45	// the decompressor may read more data than necessary from r.
    46	func NewReader(r io.Reader) io.Reader {
    47		bz2 := new(reader)
    48		bz2.br = newBitReader(r)
    49		return bz2
    50	}
    51	
    52	const bzip2FileMagic = 0x425a // "BZ"
    53	const bzip2BlockMagic = 0x314159265359
    54	const bzip2FinalMagic = 0x177245385090
    55	
    56	// setup parses the bzip2 header.
    57	func (bz2 *reader) setup(needMagic bool) error {
    58		br := &bz2.br
    59	
    60		if needMagic {
    61			magic := br.ReadBits(16)
    62			if magic != bzip2FileMagic {
    63				return StructuralError("bad magic value")
    64			}
    65		}
    66	
    67		t := br.ReadBits(8)
    68		if t != 'h' {
    69			return StructuralError("non-Huffman entropy encoding")
    70		}
    71	
    72		level := br.ReadBits(8)
    73		if level < '1' || level > '9' {
    74			return StructuralError("invalid compression level")
    75		}
    76	
    77		bz2.fileCRC = 0
    78		bz2.blockSize = 100 * 1000 * (level - '0')
    79		if bz2.blockSize > len(bz2.tt) {
    80			bz2.tt = make([]uint32, bz2.blockSize)
    81		}
    82		return nil
    83	}
    84	
    85	func (bz2 *reader) Read(buf []byte) (n int, err error) {
    86		if bz2.eof {
    87			return 0, io.EOF
    88		}
    89	
    90		if !bz2.setupDone {
    91			err = bz2.setup(true)
    92			brErr := bz2.br.Err()
    93			if brErr != nil {
    94				err = brErr
    95			}
    96			if err != nil {
    97				return 0, err
    98			}
    99			bz2.setupDone = true
   100		}
   101	
   102		n, err = bz2.read(buf)
   103		brErr := bz2.br.Err()
   104		if brErr != nil {
   105			err = brErr
   106		}
   107		return
   108	}
   109	
   110	func (bz2 *reader) readFromBlock(buf []byte) int {
   111		// bzip2 is a block based compressor, except that it has a run-length
   112		// preprocessing step. The block based nature means that we can
   113		// preallocate fixed-size buffers and reuse them. However, the RLE
   114		// preprocessing would require allocating huge buffers to store the
   115		// maximum expansion. Thus we process blocks all at once, except for
   116		// the RLE which we decompress as required.
   117		n := 0
   118		for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
   119			// We have RLE data pending.
   120	
   121			// The run-length encoding works like this:
   122			// Any sequence of four equal bytes is followed by a length
   123			// byte which contains the number of repeats of that byte to
   124			// include. (The number of repeats can be zero.) Because we are
   125			// decompressing on-demand our state is kept in the reader
   126			// object.
   127	
   128			if bz2.repeats > 0 {
   129				buf[n] = byte(bz2.lastByte)
   130				n++
   131				bz2.repeats--
   132				if bz2.repeats == 0 {
   133					bz2.lastByte = -1
   134				}
   135				continue
   136			}
   137	
   138			bz2.tPos = bz2.preRLE[bz2.tPos]
   139			b := byte(bz2.tPos)
   140			bz2.tPos >>= 8
   141			bz2.preRLEUsed++
   142	
   143			if bz2.byteRepeats == 3 {
   144				bz2.repeats = uint(b)
   145				bz2.byteRepeats = 0
   146				continue
   147			}
   148	
   149			if bz2.lastByte == int(b) {
   150				bz2.byteRepeats++
   151			} else {
   152				bz2.byteRepeats = 0
   153			}
   154			bz2.lastByte = int(b)
   155	
   156			buf[n] = b
   157			n++
   158		}
   159	
   160		return n
   161	}
   162	
   163	func (bz2 *reader) read(buf []byte) (int, error) {
   164		for {
   165			n := bz2.readFromBlock(buf)
   166			if n > 0 || len(buf) == 0 {
   167				bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
   168				return n, nil
   169			}
   170	
   171			// End of block. Check CRC.
   172			if bz2.blockCRC != bz2.wantBlockCRC {
   173				bz2.br.err = StructuralError("block checksum mismatch")
   174				return 0, bz2.br.err
   175			}
   176	
   177			// Find next block.
   178			br := &bz2.br
   179			switch br.ReadBits64(48) {
   180			default:
   181				return 0, StructuralError("bad magic value found")
   182	
   183			case bzip2BlockMagic:
   184				// Start of block.
   185				err := bz2.readBlock()
   186				if err != nil {
   187					return 0, err
   188				}
   189	
   190			case bzip2FinalMagic:
   191				// Check end-of-file CRC.
   192				wantFileCRC := uint32(br.ReadBits64(32))
   193				if br.err != nil {
   194					return 0, br.err
   195				}
   196				if bz2.fileCRC != wantFileCRC {
   197					br.err = StructuralError("file checksum mismatch")
   198					return 0, br.err
   199				}
   200	
   201				// Skip ahead to byte boundary.
   202				// Is there a file concatenated to this one?
   203				// It would start with BZ.
   204				if br.bits%8 != 0 {
   205					br.ReadBits(br.bits % 8)
   206				}
   207				b, err := br.r.ReadByte()
   208				if err == io.EOF {
   209					br.err = io.EOF
   210					bz2.eof = true
   211					return 0, io.EOF
   212				}
   213				if err != nil {
   214					br.err = err
   215					return 0, err
   216				}
   217				z, err := br.r.ReadByte()
   218				if err != nil {
   219					if err == io.EOF {
   220						err = io.ErrUnexpectedEOF
   221					}
   222					br.err = err
   223					return 0, err
   224				}
   225				if b != 'B' || z != 'Z' {
   226					return 0, StructuralError("bad magic value in continuation file")
   227				}
   228				if err := bz2.setup(false); err != nil {
   229					return 0, err
   230				}
   231			}
   232		}
   233	}
   234	
   235	// readBlock reads a bzip2 block. The magic number should already have been consumed.
   236	func (bz2 *reader) readBlock() (err error) {
   237		br := &bz2.br
   238		bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
   239		bz2.blockCRC = 0
   240		bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
   241		randomized := br.ReadBits(1)
   242		if randomized != 0 {
   243			return StructuralError("deprecated randomized files")
   244		}
   245		origPtr := uint(br.ReadBits(24))
   246	
   247		// If not every byte value is used in the block (i.e., it's text) then
   248		// the symbol set is reduced. The symbols used are stored as a
   249		// two-level, 16x16 bitmap.
   250		symbolRangeUsedBitmap := br.ReadBits(16)
   251		symbolPresent := make([]bool, 256)
   252		numSymbols := 0
   253		for symRange := uint(0); symRange < 16; symRange++ {
   254			if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
   255				bits := br.ReadBits(16)
   256				for symbol := uint(0); symbol < 16; symbol++ {
   257					if bits&(1<<(15-symbol)) != 0 {
   258						symbolPresent[16*symRange+symbol] = true
   259						numSymbols++
   260					}
   261				}
   262			}
   263		}
   264	
   265		if numSymbols == 0 {
   266			// There must be an EOF symbol.
   267			return StructuralError("no symbols in input")
   268		}
   269	
   270		// A block uses between two and six different Huffman trees.
   271		numHuffmanTrees := br.ReadBits(3)
   272		if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
   273			return StructuralError("invalid number of Huffman trees")
   274		}
   275	
   276		// The Huffman tree can switch every 50 symbols so there's a list of
   277		// tree indexes telling us which tree to use for each 50 symbol block.
   278		numSelectors := br.ReadBits(15)
   279		treeIndexes := make([]uint8, numSelectors)
   280	
   281		// The tree indexes are move-to-front transformed and stored as unary
   282		// numbers.
   283		mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
   284		for i := range treeIndexes {
   285			c := 0
   286			for {
   287				inc := br.ReadBits(1)
   288				if inc == 0 {
   289					break
   290				}
   291				c++
   292			}
   293			if c >= numHuffmanTrees {
   294				return StructuralError("tree index too large")
   295			}
   296			treeIndexes[i] = mtfTreeDecoder.Decode(c)
   297		}
   298	
   299		// The list of symbols for the move-to-front transform is taken from
   300		// the previously decoded symbol bitmap.
   301		symbols := make([]byte, numSymbols)
   302		nextSymbol := 0
   303		for i := 0; i < 256; i++ {
   304			if symbolPresent[i] {
   305				symbols[nextSymbol] = byte(i)
   306				nextSymbol++
   307			}
   308		}
   309		mtf := newMTFDecoder(symbols)
   310	
   311		numSymbols += 2 // to account for RUNA and RUNB symbols
   312		huffmanTrees := make([]huffmanTree, numHuffmanTrees)
   313	
   314		// Now we decode the arrays of code-lengths for each tree.
   315		lengths := make([]uint8, numSymbols)
   316		for i := range huffmanTrees {
   317			// The code lengths are delta encoded from a 5-bit base value.
   318			length := br.ReadBits(5)
   319			for j := range lengths {
   320				for {
   321					if length < 1 || length > 20 {
   322						return StructuralError("Huffman length out of range")
   323					}
   324					if !br.ReadBit() {
   325						break
   326					}
   327					if br.ReadBit() {
   328						length--
   329					} else {
   330						length++
   331					}
   332				}
   333				lengths[j] = uint8(length)
   334			}
   335			huffmanTrees[i], err = newHuffmanTree(lengths)
   336			if err != nil {
   337				return err
   338			}
   339		}
   340	
   341		selectorIndex := 1 // the next tree index to use
   342		if len(treeIndexes) == 0 {
   343			return StructuralError("no tree selectors given")
   344		}
   345		if int(treeIndexes[0]) >= len(huffmanTrees) {
   346			return StructuralError("tree selector out of range")
   347		}
   348		currentHuffmanTree := huffmanTrees[treeIndexes[0]]
   349		bufIndex := 0 // indexes bz2.buf, the output buffer.
   350		// The output of the move-to-front transform is run-length encoded and
   351		// we merge the decoding into the Huffman parsing loop. These two
   352		// variables accumulate the repeat count. See the Wikipedia page for
   353		// details.
   354		repeat := 0
   355		repeatPower := 0
   356	
   357		// The `C' array (used by the inverse BWT) needs to be zero initialized.
   358		for i := range bz2.c {
   359			bz2.c[i] = 0
   360		}
   361	
   362		decoded := 0 // counts the number of symbols decoded by the current tree.
   363		for {
   364			if decoded == 50 {
   365				if selectorIndex >= numSelectors {
   366					return StructuralError("insufficient selector indices for number of symbols")
   367				}
   368				if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
   369					return StructuralError("tree selector out of range")
   370				}
   371				currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
   372				selectorIndex++
   373				decoded = 0
   374			}
   375	
   376			v := currentHuffmanTree.Decode(br)
   377			decoded++
   378	
   379			if v < 2 {
   380				// This is either the RUNA or RUNB symbol.
   381				if repeat == 0 {
   382					repeatPower = 1
   383				}
   384				repeat += repeatPower << v
   385				repeatPower <<= 1
   386	
   387				// This limit of 2 million comes from the bzip2 source
   388				// code. It prevents repeat from overflowing.
   389				if repeat > 2*1024*1024 {
   390					return StructuralError("repeat count too large")
   391				}
   392				continue
   393			}
   394	
   395			if repeat > 0 {
   396				// We have decoded a complete run-length so we need to
   397				// replicate the last output symbol.
   398				if repeat > bz2.blockSize-bufIndex {
   399					return StructuralError("repeats past end of block")
   400				}
   401				for i := 0; i < repeat; i++ {
   402					b := mtf.First()
   403					bz2.tt[bufIndex] = uint32(b)
   404					bz2.c[b]++
   405					bufIndex++
   406				}
   407				repeat = 0
   408			}
   409	
   410			if int(v) == numSymbols-1 {
   411				// This is the EOF symbol. Because it's always at the
   412				// end of the move-to-front list, and never gets moved
   413				// to the front, it has this unique value.
   414				break
   415			}
   416	
   417			// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
   418			// one would expect |v-2| to be passed to the MTF decoder.
   419			// However, the front of the MTF list is never referenced as 0,
   420			// it's always referenced with a run-length of 1. Thus 0
   421			// doesn't need to be encoded and we have |v-1| in the next
   422			// line.
   423			b := mtf.Decode(int(v - 1))
   424			if bufIndex >= bz2.blockSize {
   425				return StructuralError("data exceeds block size")
   426			}
   427			bz2.tt[bufIndex] = uint32(b)
   428			bz2.c[b]++
   429			bufIndex++
   430		}
   431	
   432		if origPtr >= uint(bufIndex) {
   433			return StructuralError("origPtr out of bounds")
   434		}
   435	
   436		// We have completed the entropy decoding. Now we can perform the
   437		// inverse BWT and setup the RLE buffer.
   438		bz2.preRLE = bz2.tt[:bufIndex]
   439		bz2.preRLEUsed = 0
   440		bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
   441		bz2.lastByte = -1
   442		bz2.byteRepeats = 0
   443		bz2.repeats = 0
   444	
   445		return nil
   446	}
   447	
   448	// inverseBWT implements the inverse Burrows-Wheeler transform as described in
   449	// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
   450	// In that document, origPtr is called `I' and c is the `C' array after the
   451	// first pass over the data. It's an argument here because we merge the first
   452	// pass with the Huffman decoding.
   453	//
   454	// This also implements the `single array' method from the bzip2 source code
   455	// which leaves the output, still shuffled, in the bottom 8 bits of tt with the
   456	// index of the next byte in the top 24-bits. The index of the first byte is
   457	// returned.
   458	func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
   459		sum := uint(0)
   460		for i := 0; i < 256; i++ {
   461			sum += c[i]
   462			c[i] = sum - c[i]
   463		}
   464	
   465		for i := range tt {
   466			b := tt[i] & 0xff
   467			tt[c[b]] |= uint32(i) << 8
   468			c[b]++
   469		}
   470	
   471		return tt[origPtr] >> 8
   472	}
   473	
   474	// This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
   475	// causing the bits in the input to be processed in the reverse of the usual order.
   476	
   477	var crctab [256]uint32
   478	
   479	func init() {
   480		const poly = 0x04C11DB7
   481		for i := range crctab {
   482			crc := uint32(i) << 24
   483			for j := 0; j < 8; j++ {
   484				if crc&0x80000000 != 0 {
   485					crc = (crc << 1) ^ poly
   486				} else {
   487					crc <<= 1
   488				}
   489			}
   490			crctab[i] = crc
   491		}
   492	}
   493	
   494	// updateCRC updates the crc value to incorporate the data in b.
   495	// The initial value is 0.
   496	func updateCRC(val uint32, b []byte) uint32 {
   497		crc := ^val
   498		for _, v := range b {
   499			crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
   500		}
   501		return ^crc
   502	}
   503
View as plain text