/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.commons.compress.compressors.snappy;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.IOUtils;

/**
* CompressorInputStream for the raw Snappy format.
*
* <p>This implementation uses an internal buffer in order to handle
* the back-references that are at the heart of the LZ77 algorithm.
* The size of the buffer must be at least as big as the biggest
* offset used in the compressed stream. The current version of the
* Snappy algorithm as defined by Google works on 32k blocks and
 * doesn't contain offsets bigger than 32k, which is the default block
 * size used by this class.</p>
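 *
 * <p>A minimal usage sketch (assuming <code>in</code> is an InputStream of
 * raw Snappy data and <code>out</code> is an OutputStream, both hypothetical):</p>
 *
 * <pre>
 * try (SnappyCompressorInputStream snappyIn = new SnappyCompressorInputStream(in)) {
 *     byte[] buffer = new byte[8192];
 *     int n;
 *     while ((n = snappyIn.read(buffer)) != -1) {
 *         out.write(buffer, 0, n);
 *     }
 * }
 * </pre>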
*
* @see <a href="http://code.google.com/p/snappy/source/browse/trunk/format_description.txt">Snappy compressed format description</a>
* @since 1.7
*/
public class SnappyCompressorInputStream extends CompressorInputStream {
    /** Mask used to determine the type of "tag" being processed */
private static final int TAG_MASK = 0x03;
/** Default block size */
public static final int DEFAULT_BLOCK_SIZE = 32768;
/** Buffer to write decompressed bytes to for back-references */
private final byte[] decompressBuf;
    /** Index one past the last byte written to the buffer */
private int writeIndex;
/** Index of the next byte to be read. */
private int readIndex;
/** The actual block size specified */
private final int blockSize;
/** The underlying stream to read compressed data from */
private final InputStream in;
/** The size of the uncompressed data */
private final int size;
/** Number of uncompressed bytes still to be read. */
private int uncompressedBytesRemaining;
// used in no-arg read method
private final byte[] oneByte = new byte[1];
private boolean endReached = false;
/**
* Constructor using the default buffer size of 32k.
*
* @param is
* An InputStream to read compressed data from
*
     * @throws IOException if reading fails
*/
public SnappyCompressorInputStream(final InputStream is) throws IOException {
this(is, DEFAULT_BLOCK_SIZE);
}
/**
* Constructor using a configurable buffer size.
*
* @param is
* An InputStream to read compressed data from
* @param blockSize
* The block size used in compression
*
     * @throws IOException if reading fails
*/
public SnappyCompressorInputStream(final InputStream is, final int blockSize)
throws IOException {
this.in = is;
this.blockSize = blockSize;
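        // Decompressed data accumulates in this buffer (three block lengths);
        // back-references are resolved against it and slideBuffer() drops the
        // oldest block once the read position has moved past it.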
this.decompressBuf = new byte[blockSize * 3];
this.writeIndex = readIndex = 0;
uncompressedBytesRemaining = size = (int) readSize();
}
/** {@inheritDoc} */
@Override
public int read() throws IOException {
return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
}
/** {@inheritDoc} */
@Override
public void close() throws IOException {
in.close();
}
/** {@inheritDoc} */
@Override
public int available() {
return writeIndex - readIndex;
}
/**
* {@inheritDoc}
*/
@Override
public int read(byte[] b, int off, int len) throws IOException {
if (endReached) {
return -1;
}
final int avail = available();
if (len > avail) {
fill(len - avail);
}
int readable = Math.min(len, available());
if (readable == 0 && len > 0) {
return -1;
}
System.arraycopy(decompressBuf, readIndex, b, off, readable);
readIndex += readable;
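        // Once the read position is more than one block into the buffer, drop
        // the oldest block to make room for further decompression.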
if (readIndex > blockSize) {
slideBuffer();
}
return readable;
}
/**
* Try to fill the buffer with enough bytes to satisfy the current
* read request.
*
* @param len the number of uncompressed bytes to read
*/
private void fill(int len) throws IOException {
if (uncompressedBytesRemaining == 0) {
endReached = true;
}
int readNow = Math.min(len, uncompressedBytesRemaining);
while (readNow > 0) {
final int b = readOneByte();
int length = 0;
long offset = 0;
switch (b & TAG_MASK) {
case 0x00:
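                /*
                 * Literals: the uncompressed bytes are stored directly in the
                 * stream, preceded by their length (see readLiteralLength).
                 */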
length = readLiteralLength(b);
if (expandLiteral(length)) {
return;
}
break;
case 0x01:
/*
* These elements can encode lengths between [4..11] bytes and
* offsets between [0..2047] bytes. (len-4) occupies three bits
* and is stored in bits [2..4] of the tag byte. The offset
* occupies 11 bits, of which the upper three are stored in the
* upper three bits ([5..7]) of the tag byte, and the lower
* eight are stored in a byte following the tag byte.
*/
length = 4 + ((b >> 2) & 0x07);
offset = (b & 0xE0) << 3;
offset |= readOneByte();
if (expandCopy(offset, length)) {
return;
}
break;
case 0x02:
/*
* These elements can encode lengths between [1..64] and offsets
* from [0..65535]. (len-1) occupies six bits and is stored in
* the upper six bits ([2..7]) of the tag byte. The offset is
* stored as a little-endian 16-bit integer in the two bytes
* following the tag byte.
*/
length = (b >> 2) + 1;
offset = readOneByte();
offset |= readOneByte() << 8;
if (expandCopy(offset, length)) {
return;
}
break;
case 0x03:
/*
* These are like the copies with 2-byte offsets (see previous
* subsection), except that the offset is stored as a 32-bit
* integer instead of a 16-bit integer (and thus will occupy
* four bytes).
*/
length = (b >> 2) + 1;
offset = readOneByte();
offset |= readOneByte() << 8;
offset |= readOneByte() << 16;
offset |= ((long) readOneByte()) << 24;
if (expandCopy(offset, length)) {
return;
}
break;
}
readNow -= length;
uncompressedBytesRemaining -= length;
}
}
/**
* Slide buffer.
*
* <p>Move all bytes of the buffer after the first block down to
* the beginning of the buffer.</p>
*/
private void slideBuffer() {
System.arraycopy(decompressBuf, blockSize, decompressBuf, 0,
blockSize * 2);
writeIndex -= blockSize;
readIndex -= blockSize;
}
    /*
     * For literals up to and including 60 bytes in length, the upper six bits
     * of the tag byte contain (len-1). The literal follows immediately
     * thereafter in the bytestream.
     *
     * For longer literals, the (len-1) value is stored after the tag byte,
     * little-endian. The upper six bits of the tag byte describe how many
     * bytes are used for the length; 60, 61, 62 or 63 for 1-4 bytes,
     * respectively. The literal itself follows after the length.
     */
private int readLiteralLength(int b) throws IOException {
int length;
switch (b >> 2) {
case 60:
length = readOneByte();
break;
case 61:
length = readOneByte();
length |= readOneByte() << 8;
break;
case 62:
length = readOneByte();
length |= readOneByte() << 8;
length |= readOneByte() << 16;
break;
case 63:
length = readOneByte();
length |= readOneByte() << 8;
length |= readOneByte() << 16;
length |= (((long) readOneByte()) << 24);
break;
default:
length = b >> 2;
break;
}
return length + 1;
}
/**
* Literals are uncompressed data stored directly in the byte stream.
*
* @param length
* The number of bytes to read from the underlying stream
*
     * @throws IOException
     *             If reading from the underlying stream fails, or if it ends
     *             before the requested number of bytes have been read
* @return True if the decompressed data should be flushed
*/
private boolean expandLiteral(final int length) throws IOException {
int bytesRead = IOUtils.readFully(in, decompressBuf, writeIndex, length);
count(bytesRead);
if (length != bytesRead) {
throw new IOException("Premature end of stream");
}
writeIndex += length;
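        // Signal a flush once two blocks' worth of data has been buffered, so
        // fill() stops decoding and read() can drain the buffer.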
return writeIndex >= 2 * this.blockSize;
}
/**
* Copies are references back into previous decompressed data, telling the
* decompressor to reuse data it has previously decoded. They encode two
* values: The offset, saying how many bytes back from the current position
* to read, and the length, how many bytes to copy. Offsets of zero can be
* encoded, but are not legal; similarly, it is possible to encode
* backreferences that would go past the end of the block (offset > current
* decompressed position), which is also nonsensical and thus not allowed.
*
* @param off
     *            The offset backward from the end of the expanded stream
* @param length
* The number of bytes to copy
*
* @throws IOException
     *             If the offset reaches back past the front of the
     *             decompression buffer
* @return True if the decompressed data should be flushed
*/
private boolean expandCopy(final long off, int length) throws IOException {
if (off > blockSize) {
throw new IOException("Offset is larger than block size");
}
int offset = (int) off;
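        // An offset of 1 is a run of the previous byte; a copy that does not
        // overlap its source is done with a single arraycopy; an overlapping
        // copy is expanded in offset-sized chunks plus a final partial chunk.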
if (offset == 1) {
byte lastChar = decompressBuf[writeIndex - 1];
for (int i = 0; i < length; i++) {
decompressBuf[writeIndex++] = lastChar;
}
} else if (length < offset) {
System.arraycopy(decompressBuf, writeIndex - offset,
decompressBuf, writeIndex, length);
writeIndex += length;
} else {
int fullRotations = length / offset;
int pad = length - (offset * fullRotations);
while (fullRotations-- != 0) {
System.arraycopy(decompressBuf, writeIndex - offset,
decompressBuf, writeIndex, offset);
writeIndex += offset;
}
if (pad > 0) {
System.arraycopy(decompressBuf, writeIndex - offset,
decompressBuf, writeIndex, pad);
writeIndex += pad;
}
}
return writeIndex >= 2 * this.blockSize;
}
/**
* This helper method reads the next byte of data from the input stream. The
* value byte is returned as an <code>int</code> in the range <code>0</code>
* to <code>255</code>. If no byte is available because the end of the
     * stream has been reached, an <code>IOException</code> is thrown.
*
* @return The next byte of data
* @throws IOException
     *             If EOF is reached or an error occurs while reading the stream
*/
private int readOneByte() throws IOException {
int b = in.read();
if (b == -1) {
throw new IOException("Premature end of stream");
}
count(1);
return b & 0xFF;
}
/**
* The stream starts with the uncompressed length (up to a maximum of 2^32 -
* 1), stored as a little-endian varint. Varints consist of a series of
* bytes, where the lower 7 bits are data and the upper bit is set iff there
* are more bytes to be read. In other words, an uncompressed length of 64
* would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE)
* would be stored as 0xFE 0xFF 0x7F.
*
* @return The size of the uncompressed data
*
* @throws IOException
     *             If a byte could not be read
*/
private long readSize() throws IOException {
int index = 0;
long sz = 0;
int b = 0;
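        // Each varint byte contributes its low seven bits; a set high bit
        // means another byte follows.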
do {
b = readOneByte();
sz |= (b & 0x7f) << (index++ * 7);
} while (0 != (b & 0x80));
return sz;
}
/**
* Get the uncompressed size of the stream
*
* @return the uncompressed size
*/
public int getSize() {
return size;
}
}