001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.snappy;
020
021import java.io.IOException;
022import java.io.InputStream;
023
024import org.apache.commons.compress.compressors.CompressorInputStream;
025import org.apache.commons.compress.utils.IOUtils;
026
027/**
028 * CompressorInputStream for the raw Snappy format.
029 *
030 * <p>This implementation uses an internal buffer in order to handle
031 * the back-references that are at the heart of the LZ77 algorithm.
032 * The size of the buffer must be at least as big as the biggest
033 * offset used in the compressed stream.  The current version of the
034 * Snappy algorithm as defined by Google works on 32k blocks and
035 * doesn't contain offsets bigger than 32k which is the default block
036 * size used by this class.</p>
037 *
038 * @see <a href="http://code.google.com/p/snappy/source/browse/trunk/format_description.txt">Snappy compressed format description</a>
039 * @since 1.7
040 */
041public class SnappyCompressorInputStream extends CompressorInputStream {
042
043    /** Mask used to determine the type of "tag" is being processed */
044    private static final int TAG_MASK = 0x03;
045
046    /** Default block size */
047    public static final int DEFAULT_BLOCK_SIZE = 32768;
048
049    /** Buffer to write decompressed bytes to for back-references */
050    private final byte[] decompressBuf;
051
052    /** One behind the index of the last byte in the buffer that was written */
053    private int writeIndex;
054
055    /** Index of the next byte to be read. */
056    private int readIndex;
057
058    /** The actual block size specified */
059    private final int blockSize;
060
061    /** The underlying stream to read compressed data from */
062    private final InputStream in;
063
064    /** The size of the uncompressed data */
065    private final int size;
066
067    /** Number of uncompressed bytes still to be read. */
068    private int uncompressedBytesRemaining;
069
070    // used in no-arg read method
071    private final byte[] oneByte = new byte[1];
072
073    private boolean endReached = false;
074
075    /**
076     * Constructor using the default buffer size of 32k.
077     * 
078     * @param is
079     *            An InputStream to read compressed data from
080     * 
081     * @throws IOException
082     */
083    public SnappyCompressorInputStream(final InputStream is) throws IOException {
084        this(is, DEFAULT_BLOCK_SIZE);
085    }
086
087    /**
088     * Constructor using a configurable buffer size.
089     * 
090     * @param is
091     *            An InputStream to read compressed data from
092     * @param blockSize
093     *            The block size used in compression
094     * 
095     * @throws IOException
096     */
097    public SnappyCompressorInputStream(final InputStream is, final int blockSize)
098            throws IOException {
099        this.in = is;
100        this.blockSize = blockSize;
101        this.decompressBuf = new byte[blockSize * 3];
102        this.writeIndex = readIndex = 0;
103        uncompressedBytesRemaining = size = (int) readSize();
104    }
105
106    /** {@inheritDoc} */
107    @Override
108    public int read() throws IOException {
109        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
110    }
111
112    /** {@inheritDoc} */
113    @Override
114    public void close() throws IOException {
115        in.close();
116    }
117
118    /** {@inheritDoc} */
119    @Override
120    public int available() {
121        return writeIndex - readIndex;
122    }
123
124    /**
125     * {@inheritDoc}
126     */
127    @Override
128    public int read(byte[] b, int off, int len) throws IOException {
129        if (endReached) {
130            return -1;
131        }
132        final int avail = available();
133        if (len > avail) {
134            fill(len - avail);
135        }
136
137        int readable = Math.min(len, available());
138        System.arraycopy(decompressBuf, readIndex, b, off, readable);
139        readIndex += readable;
140        if (readIndex > blockSize) {
141            slideBuffer();
142        }
143        return readable;
144    }
145
146    /**
147     * Try to fill the buffer with enough bytes to satisfy the current
148     * read request.
149     *
150     * @param len the number of uncompressed bytes to read
151     */
152    private void fill(int len) throws IOException {
153        if (uncompressedBytesRemaining == 0) {
154            endReached = true;
155        }
156        int readNow = Math.min(len, uncompressedBytesRemaining);
157
158        while (readNow > 0) {
159            final int b = readOneByte();
160            int length = 0;
161            long offset = 0;
162
163            switch (b & TAG_MASK) {
164
165            case 0x00:
166
167                length = readLiteralLength(b);
168
169                if (expandLiteral(length)) {
170                    return;
171                }
172                break;
173
174            case 0x01:
175
176                /*
177                 * These elements can encode lengths between [4..11] bytes and
178                 * offsets between [0..2047] bytes. (len-4) occupies three bits
179                 * and is stored in bits [2..4] of the tag byte. The offset
180                 * occupies 11 bits, of which the upper three are stored in the
181                 * upper three bits ([5..7]) of the tag byte, and the lower
182                 * eight are stored in a byte following the tag byte.
183                 */
184
185                length = 4 + ((b >> 2) & 0x07);
186                offset = (b & 0xE0) << 3;
187                offset |= readOneByte();
188
189                if (expandCopy(offset, length)) {
190                    return;
191                }
192                break;
193
194            case 0x02:
195
196                /*
197                 * These elements can encode lengths between [1..64] and offsets
198                 * from [0..65535]. (len-1) occupies six bits and is stored in
199                 * the upper six bits ([2..7]) of the tag byte. The offset is
200                 * stored as a little-endian 16-bit integer in the two bytes
201                 * following the tag byte.
202                 */
203
204                length = (b >> 2) + 1;
205
206                offset = readOneByte();
207                offset |= readOneByte() << 8;
208
209                if (expandCopy(offset, length)) {
210                    return;
211                }
212                break;
213
214            case 0x03:
215
216                /*
217                 * These are like the copies with 2-byte offsets (see previous
218                 * subsection), except that the offset is stored as a 32-bit
219                 * integer instead of a 16-bit integer (and thus will occupy
220                 * four bytes).
221                 */
222
223                length = (b >> 2) + 1;
224
225                offset = readOneByte();
226                offset |= readOneByte() << 8;
227                offset |= readOneByte() << 16;
228                offset |= ((long) readOneByte()) << 24;
229
230                if (expandCopy(offset, length)) {
231                    return;
232                }
233                break;
234            }
235
236            readNow -= length;
237            uncompressedBytesRemaining -= length;
238        }
239    }
240
241    /**
242     * Slide buffer.
243     *
244     * <p>Move all bytes of the buffer after the first block down to
245     * the beginning of the buffer.</p>
246     */
247    private void slideBuffer() {
248        System.arraycopy(decompressBuf, blockSize, decompressBuf, 0,
249                         blockSize * 2);
250        writeIndex -= blockSize;
251        readIndex -= blockSize;
252    }
253
254
255    /*
256     * For literals up to and including 60 bytes in length, the
257     * upper six bits of the tag byte contain (len-1). The literal
258     * follows immediately thereafter in the bytestream. - For
259     * longer literals, the (len-1) value is stored after the tag
260     * byte, little-endian. The upper six bits of the tag byte
261     * describe how many bytes are used for the length; 60, 61, 62
262     * or 63 for 1-4 bytes, respectively. The literal itself follows
263     * after the length.
264     */
265    private int readLiteralLength(int b) throws IOException {
266        int length;
267        switch (b >> 2) {
268        case 60:
269            length = readOneByte();
270            break;
271        case 61:
272            length = readOneByte();
273            length |= readOneByte() << 8;
274            break;
275        case 62:
276            length = readOneByte();
277            length |= readOneByte() << 8;
278            length |= readOneByte() << 16;
279            break;
280        case 63:
281            length = readOneByte();
282            length |= readOneByte() << 8;
283            length |= readOneByte() << 16;
284            length |= (((long) readOneByte()) << 24);
285            break;
286        default:
287            length = b >> 2;
288            break;
289        }
290
291        return length + 1;
292    }
293
294    /**
295     * Literals are uncompressed data stored directly in the byte stream.
296     * 
297     * @param length
298     *            The number of bytes to read from the underlying stream
299     * 
300     * @throws IOException
301     *             If the first byte cannot be read for any reason other than
302     *             end of file, or if the input stream has been closed, or if
303     *             some other I/O error occurs.
304     * @return True if the decompressed data should be flushed
305     */
306    private boolean expandLiteral(final int length) throws IOException {
307        int bytesRead = IOUtils.readFully(in, decompressBuf, writeIndex, length);
308        count(bytesRead);
309        if (length != bytesRead) {
310            throw new IOException("Premature end of stream");
311        }
312
313        writeIndex += length;
314        return writeIndex >= 2 * this.blockSize;
315    }
316
317    /**
318     * Copies are references back into previous decompressed data, telling the
319     * decompressor to reuse data it has previously decoded. They encode two
320     * values: The offset, saying how many bytes back from the current position
321     * to read, and the length, how many bytes to copy. Offsets of zero can be
322     * encoded, but are not legal; similarly, it is possible to encode
323     * backreferences that would go past the end of the block (offset > current
324     * decompressed position), which is also nonsensical and thus not allowed.
325     * 
326     * @param off
327     *            The offset from the backward from the end of expanded stream
328     * @param length
329     *            The number of bytes to copy
330     * 
331     * @throws IOException
332     *             An the offset expands past the front of the decompression
333     *             buffer
334     * @return True if the decompressed data should be flushed
335     */
336    private boolean expandCopy(final long off, int length) throws IOException {
337        if (off > blockSize) {
338            throw new IOException("Offset is larger than block size");
339        }
340        int offset = (int) off;
341
342        if (offset == 1) {
343            byte lastChar = decompressBuf[writeIndex - 1];
344            for (int i = 0; i < length; i++) {
345                decompressBuf[writeIndex++] = lastChar;
346            }
347        } else if (length < offset) {
348            System.arraycopy(decompressBuf, writeIndex - offset,
349                    decompressBuf, writeIndex, length);
350            writeIndex += length;
351        } else {
352            int fullRotations = length / offset;
353            int pad = length - (offset * fullRotations);
354
355            while (fullRotations-- != 0) {
356                System.arraycopy(decompressBuf, writeIndex - offset,
357                        decompressBuf, writeIndex, offset);
358                writeIndex += offset;
359            }
360
361            if (pad > 0) {
362                System.arraycopy(decompressBuf, writeIndex - offset,
363                        decompressBuf, writeIndex, pad);
364
365                writeIndex += pad;
366            }
367        }
368        return writeIndex >= 2 * this.blockSize;
369    }
370
371    /**
372     * This helper method reads the next byte of data from the input stream. The
373     * value byte is returned as an <code>int</code> in the range <code>0</code>
374     * to <code>255</code>. If no byte is available because the end of the
375     * stream has been reached, an Exception is thrown.
376     * 
377     * @return The next byte of data
378     * @throws IOException
379     *             EOF is reached or error reading the stream
380     */
381    private int readOneByte() throws IOException {
382        int b = in.read();
383        if (b == -1) {
384            throw new IOException("Premature end of stream");
385        }
386        count(1);
387        return b & 0xFF;
388    }
389
390    /**
391     * The stream starts with the uncompressed length (up to a maximum of 2^32 -
392     * 1), stored as a little-endian varint. Varints consist of a series of
393     * bytes, where the lower 7 bits are data and the upper bit is set iff there
394     * are more bytes to be read. In other words, an uncompressed length of 64
395     * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE)
396     * would be stored as 0xFE 0xFF 0x7F.
397     * 
398     * @return The size of the uncompressed data
399     * 
400     * @throws IOException
401     *             Could not read a byte
402     */
403    private long readSize() throws IOException {
404        int index = 0;
405        long sz = 0;
406        int b = 0;
407
408        do {
409            b = readOneByte();
410            sz |= (b & 0x7f) << (index++ * 7);
411        } while (0 != (b & 0x80));
412        return sz;
413    }
414
415    /**
416     * Get the uncompressed size of the stream
417     * 
418     * @return the uncompressed size
419     */
420    public int getSize() {
421        return size;
422    }
423
424}