/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

/*
 * This package is based on the work done by Timothy Gerard Endres
 * (time@ice.com) to whom the Ant project is very grateful for his great code.
 */

package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.CharsetNames;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The TarInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
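 *
 * <p>A minimal usage sketch (the file name {@code "archive.tar"} is
 * only an illustration):</p>
 * <pre>{@code
 * TarArchiveInputStream in =
 *     new TarArchiveInputStream(new FileInputStream("archive.tar"));
 * TarArchiveEntry entry;
 * while ((entry = in.getNextTarEntry()) != null) {
 *     // consume the entry's data via in.read(...)
 * }
 * in.close();
 * }</pre>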
 * @NotThreadSafe
 */
public class TarArchiveInputStream extends ArchiveInputStream {

    private static final int SMALL_BUFFER_SIZE = 256;

    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];

    /** The size of a TAR record */
    private final int recordSize;

    /** The size of a block */
    private final int blockSize;

    /** True if file has hit EOF */
    private boolean hasHitEOF;

    /** Size of the current entry */
    private long entrySize;

    /** How far into the entry the stream is */
    private long entryOffset;

    /** An input stream to read from */
    private final InputStream is;

    /** The meta-data about the current entry */
    private TarArchiveEntry currEntry;

    /** The encoding of the file */
    private final ZipEncoding zipEncoding;

    // the provided encoding (for unit tests)
    final String encoding;

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     */
    public TarArchiveInputStream(InputStream is) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, String encoding) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE,
             encoding);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize,
                                 String encoding) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize) {
        this(is, blockSize, recordSize, null);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize,
                                 String encoding) {
        this.is = is;
        this.hasHitEOF = false;
        this.encoding = encoding;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordSize = recordSize;
        this.blockSize = blockSize;
    }

    /**
     * Closes this stream by closing the underlying input stream.
     * @throws IOException on error
     */
    @Override
    public void close() throws IOException {
        is.close();
    }

    /**
     * Get the record size being used by this stream's buffer.
     *
     * @return The record size.
     */
    public int getRecordSize() {
        return recordSize;
    }

    /**
     * Get the available data that can be read from the current
     * entry in the archive. This does not indicate how much data
     * is left in the entire archive, only in the current entry.
     * This value is determined from the entry's size header field
     * and the amount of data already read from the current entry.
     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
     * bytes are left in the current entry in the archive.
     *
     * @return The number of available bytes for the current entry.
     * @throws IOException for signature compatibility with InputStream
     */
    @Override
    public int available() throws IOException {
        if (entrySize - entryOffset > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) (entrySize - entryOffset);
    }

    /**
     * Skips over and discards <code>n</code> bytes of data from this input
     * stream. The <code>skip</code> method may, for a variety of reasons, end
     * up skipping over some smaller number of bytes, possibly <code>0</code>.
     * This may result from any of a number of conditions; reaching end of file
     * or end of entry before <code>n</code> bytes have been skipped are only
     * two possibilities. The actual number of bytes skipped is returned. If
     * <code>n</code> is negative, no bytes are skipped.
     *
     * @param n
     *            the number of bytes to be skipped.
     * @return the actual number of bytes skipped.
     * @exception IOException
     *                if some other I/O error occurs.
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n <= 0) {
            return 0;
        }

        final long available = entrySize - entryOffset;
        final long skipped = is.skip(Math.min(n, available));
        count(skipped);
        entryOffset += skipped;
        return skipped;
    }

    /**
     * Since we do not support marking just yet, we return false.
     *
     * @return False.
     */
    @Override
    public boolean markSupported() {
        return false;
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     *
     * @param markLimit The limit to mark.
     */
    @Override
    public void mark(int markLimit) {
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     */
    @Override
    public synchronized void reset() {
    }

    /**
     * Get the next entry in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, position the input stream at the header of the
     * next entry, read that header, and instantiate a new
     * TarEntry from the header bytes, returning that entry.
     * If there are no more entries in the archive, null will
     * be returned to indicate that the end of the archive has
     * been reached.
     *
     * @return The next TarEntry in the archive, or null.
     * @throws IOException on error
     */
    public TarArchiveEntry getNextTarEntry() throws IOException {
        if (hasHitEOF) {
            return null;
        }

        if (currEntry != null) {
            /* Skip will only go to the end of the current entry */
            IOUtils.skip(this, Long.MAX_VALUE);

            /* skip to the end of the last record */
            skipRecordPadding();
        }

        byte[] headerBuf = getRecord();

        if (headerBuf == null) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            currEntry = new TarArchiveEntry(headerBuf, zipEncoding);
        } catch (IllegalArgumentException e) {
            IOException ioe = new IOException("Error detected parsing the header");
            ioe.initCause(e);
            throw ioe;
        }

        entryOffset = 0;
        entrySize = currEntry.getSize();

        if (currEntry.isGNULongLinkEntry()) {
            byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by
                // entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by
                // entry
                return null;
            }
            currEntry.setName(zipEncoding.decode(longNameData));
        }

        if (currEntry.isPaxHeader()) { // Process Pax headers
            paxHeaders();
        }

        if (currEntry.isGNUSparse()) { // Process sparse files
            readGNUSparse();
        }

        // If the size of the next element in the archive has changed
        // due to a new size being reported in the posix header
        // information, we update entrySize here so that it contains
        // the correct value.
        entrySize = currEntry.getSize();

        return currEntry;
    }

    /**
     * The last record block should be written at the full size, so skip any
     * additional space used to fill a record after an entry.
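     *
     * <p>(Worked example under the default 512-byte record size: a
     * 10-byte entry occupies one full record, so the 502 padding bytes
     * that complete that record are skipped here.)</p>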
     */
    private void skipRecordPadding() throws IOException {
        if (this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
            long numRecords = (this.entrySize / this.recordSize) + 1;
            long padding = (numRecords * this.recordSize) - this.entrySize;
            long skipped = IOUtils.skip(is, padding);
            count(skipped);
        }
    }

    /**
     * Get the next entry in this tar archive as longname data.
     *
     * @return The next entry in the archive as longname data, or null.
     * @throws IOException on error
     */
    protected byte[] getLongNameData() throws IOException {
        // read in the name
        ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length = 0;
        while ((length = read(smallBuf)) >= 0) {
            longName.write(smallBuf, 0, length);
        }
        getNextEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }
        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            byte[] l = new byte[length];
            System.arraycopy(longNameData, 0, l, 0, length);
            longNameData = l;
        }
        return longNameData;
    }

    /**
     * Get the next record in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry.
     *
     * <p>If there are no more entries in the archive, null will be
     * returned to indicate that the end of the archive has been
     * reached.  At the same time the {@code hasHitEOF} marker will be
     * set to true.</p>
     *
     * @return The next header in the archive, or null.
     * @throws IOException on error
     */
    private byte[] getRecord() throws IOException {
        byte[] headerBuf = readRecord();
        hasHitEOF = isEOFRecord(headerBuf);
        if (hasHitEOF && headerBuf != null) {
            tryToConsumeSecondEOFRecord();
            consumeRemainderOfLastBlock();
            headerBuf = null;
        }
        return headerBuf;
    }

    /**
     * Determine if an archive record indicates the end of the archive. End of
     * archive is indicated by a record that consists entirely of null bytes.
     *
     * @param record The record data to check.
     * @return true if the record data is an End of Archive
     */
    protected boolean isEOFRecord(byte[] record) {
        return record == null || ArchiveUtils.isArrayZero(record, recordSize);
    }

    /**
     * Read a record from the input stream and return the data.
     *
     * @return The record data or null if EOF has been hit.
     * @throws IOException on error
     */
    protected byte[] readRecord() throws IOException {

        byte[] record = new byte[recordSize];

        int readNow = IOUtils.readFully(is, record);
        count(readNow);
        if (readNow != recordSize) {
            return null;
        }

        return record;
    }

    private void paxHeaders() throws IOException {
        Map<String, String> headers = parsePaxHeaders(this);
        getNextEntry(); // Get the actual file entry
        applyPaxHeadersToCurrentEntry(headers);
    }

    Map<String, String> parsePaxHeaders(InputStream i) throws IOException {
        Map<String, String> headers = new HashMap<String, String>();
        // Format is "length keyword=value\n";
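        // The length field counts the whole line, including the length
        // digits themselves, the space, the '=' and the trailing newline.
        // A made-up illustrative record:
        //   "30 mtime=1321711775.972059463\n"
        // i.e. 3 + 5 + 1 + 20 + 1 = 30 bytes.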
        while (true) { // get length
            int ch;
            int len = 0;
            int read = 0;
            while ((ch = i.read()) != -1) {
                read++;
                if (ch == ' ') { // End of length string
                    // Get keyword
                    ByteArrayOutputStream coll = new ByteArrayOutputStream();
                    while ((ch = i.read()) != -1) {
                        read++;
                        if (ch == '=') { // end of keyword
                            String keyword = coll.toString(CharsetNames.UTF_8);
                            // Get rest of entry
                            final int restLen = len - read;
                            byte[] rest = new byte[restLen];
                            int got = IOUtils.readFully(i, rest);
                            if (got != restLen) {
                                throw new IOException("Failed to read "
                                                      + "Paxheader. Expected "
                                                      + restLen
                                                      + " bytes, read "
                                                      + got);
                            }
                            // Drop trailing NL
                            String value = new String(rest, 0,
                                                      restLen - 1, CharsetNames.UTF_8);
                            headers.put(keyword, value);
                            break;
                        }
                        coll.write((byte) ch);
                    }
                    break; // Processed single header
                }
                len *= 10;
                len += ch - '0';
            }
            if (ch == -1) { // EOF
                break;
            }
        }
        return headers;
    }

    private void applyPaxHeadersToCurrentEntry(Map<String, String> headers) {
        /*
         * The following headers are defined for Pax.
         * atime, ctime, charset: cannot use these without changing TarArchiveEntry fields
         * mtime
         * comment
         * gid, gname
         * linkpath
         * size
         * uid, uname
         * SCHILY.devminor, SCHILY.devmajor: don't have setters/getters for those
         */
        for (Entry<String, String> ent : headers.entrySet()) {
            String key = ent.getKey();
            String val = ent.getValue();
            if ("path".equals(key)) {
                currEntry.setName(val);
            } else if ("linkpath".equals(key)) {
                currEntry.setLinkName(val);
            } else if ("gid".equals(key)) {
                currEntry.setGroupId(Long.parseLong(val));
            } else if ("gname".equals(key)) {
                currEntry.setGroupName(val);
            } else if ("uid".equals(key)) {
                currEntry.setUserId(Long.parseLong(val));
            } else if ("uname".equals(key)) {
                currEntry.setUserName(val);
            } else if ("size".equals(key)) {
                currEntry.setSize(Long.parseLong(val));
            } else if ("mtime".equals(key)) {
                currEntry.setModTime((long) (Double.parseDouble(val) * 1000));
            } else if ("SCHILY.devminor".equals(key)) {
                currEntry.setDevMinor(Integer.parseInt(val));
            } else if ("SCHILY.devmajor".equals(key)) {
                currEntry.setDevMajor(Integer.parseInt(val));
            }
        }
    }

    /**
     * Adds the sparse chunks from the current entry to the sparse chunks,
     * including any additional sparse entries following the current entry.
     *
     * @throws IOException on error
     *
     * @todo Sparse files are not really processed yet.
     */
    private void readGNUSparse() throws IOException {
        /* we do not really process sparse files yet
        sparses = new ArrayList();
        sparses.addAll(currEntry.getSparses());
        */
        if (currEntry.isExtended()) {
            TarArchiveSparseEntry entry;
            do {
                byte[] headerBuf = getRecord();
                if (headerBuf == null) {
                    currEntry = null;
                    break;
                }
                entry = new TarArchiveSparseEntry(headerBuf);
                /* we do not really process sparse files yet
                sparses.addAll(entry.getSparses());
                */
            } while (entry.isExtended());
        }
    }

    /**
     * Returns the next Archive Entry in this Stream.
     *
     * @return the next entry,
     *         or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public ArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }

    /**
     * Tries to read the next record, rewinding the stream if it is not an EOF record.
     *
     * <p>This is meant to protect against cases where a tar
     * implementation has written only one EOF record when two are
     * expected.  Actually this won't help since a non-conforming
     * implementation likely won't fill full blocks consisting of - by
     * default - twenty records either, so we probably have already read
     * beyond the archive anyway.</p>
     */
    private void tryToConsumeSecondEOFRecord() throws IOException {
        boolean shouldReset = true;
        boolean marked = is.markSupported();
        if (marked) {
            is.mark(recordSize);
        }
        try {
            shouldReset = !isEOFRecord(readRecord());
        } finally {
            if (shouldReset && marked) {
                pushedBackBytes(recordSize);
                is.reset();
            }
        }
    }

    /**
     * Reads bytes from the current tar archive entry.
     *
     * This method is aware of the boundaries of the current
     * entry in the archive and will deal with them as if they
     * were this stream's start and EOF.
     *
     * @param buf The buffer into which to place bytes read.
     * @param offset The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(byte[] buf, int offset, int numToRead) throws IOException {
        int totalRead = 0;

        if (hasHitEOF || entryOffset >= entrySize) {
            return -1;
        }

        if (currEntry == null) {
            throw new IllegalStateException("No current tar entry");
        }

        numToRead = Math.min(numToRead, available());

        totalRead = is.read(buf, offset, numToRead);

        if (totalRead == -1) {
            if (numToRead > 0) {
                throw new IOException("Truncated TAR archive");
            }
            hasHitEOF = true;
        } else {
            count(totalRead);
            entryOffset += totalRead;
        }

        return totalRead;
    }

    /**
     * Whether this class is able to read the given entry.
     *
     * <p>May return false if the current entry is a sparse file.</p>
     *
     * @param ae the entry to test
     * @return whether the entry's data can be read
     */
    @Override
    public boolean canReadEntryData(ArchiveEntry ae) {
        if (ae instanceof TarArchiveEntry) {
            TarArchiveEntry te = (TarArchiveEntry) ae;
            return !te.isGNUSparse();
        }
        return false;
    }

    /**
     * Get the current TAR Archive Entry that this input stream is processing.
     *
     * @return The current Archive Entry
     */
    public TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }

    protected final void setCurrentEntry(TarArchiveEntry e) {
        currEntry = e;
    }

    protected final boolean isAtEOF() {
        return hasHitEOF;
    }

    protected final void setAtEOF(boolean b) {
        hasHitEOF = b;
    }

    /**
     * This method is invoked once the end of the archive is hit; it
     * tries to consume the remaining bytes under the assumption that
     * the tool creating this archive has padded the last block.
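     *
     * <p>(Worked example under the default 10240-byte block size: if
     * 1536 bytes of the final block have been read, the remaining
     * 8704 bytes are skipped.)</p>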
     */
    private void consumeRemainderOfLastBlock() throws IOException {
        long bytesReadOfLastBlock = getBytesRead() % blockSize;
        if (bytesReadOfLastBlock > 0) {
            long skipped = IOUtils.skip(is, blockSize - bytesReadOfLastBlock);
            count(skipped);
        }
    }

    /**
     * Checks if the signature matches what is expected for a tar file.
     *
     * @param signature
     *            the bytes to check
     * @param length
     *            the number of bytes to check
     * @return true if this stream is a tar archive stream, false otherwise
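     *
     * <p>(A minimal usage sketch, assuming at least one full 512-byte
     * record has been read into {@code buf}:
     * {@code TarArchiveInputStream.matches(buf, bytesRead)}.)</p>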
     */
    public static boolean matches(byte[] signature, int length) {
        if (length < TarConstants.VERSION_OFFSET + TarConstants.VERSIONLEN) {
            return false;
        }

        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
                ) {
            return true;
        }
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            (
             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            ||
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            )
                ) {
            return true;
        }
        // COMPRESS-107 - recognise Ant tar files
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
                ) {
            return true;
        }
        return false;
    }

}