infrastructure for non-ASCII encoding of file names in tar. COMPRESS-183
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/compress/trunk@1302170 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveEntry.java b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveEntry.java
index 23ef4aa..2ea8a22 100644
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveEntry.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveEntry.java
@@ -19,11 +19,13 @@
package org.apache.commons.compress.archivers.tar;
import java.io.File;
+import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Date;
import java.util.Locale;
import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipEncoding;
/**
* This class represents an entry in a Tar archive. It consists
@@ -177,7 +179,7 @@
/**
* Construct an empty entry and prepares the header values.
*/
- private TarArchiveEntry () {
+ private TarArchiveEntry() {
this.magic = MAGIC_POSIX;
this.version = VERSION_POSIX;
this.name = "";
@@ -307,8 +309,30 @@
* @throws IllegalArgumentException if any of the numeric fields have an invalid format
*/
public TarArchiveEntry(byte[] headerBuf) {
+ // No explicit encoding: use the shared default, exactly as
+ // parseTarHeader(byte[]) does. Passing null here would be
+ // dereferenced by TarUtils.parseName when decoding a non-empty name.
+ this(headerBuf, TarUtils.DEFAULT_ENCODING);
+ }
+
+ /**
+ * Construct an entry from an archive's header bytes. File is set
+ * to null.
+ *
+ * <p>The header is first parsed with the given encoding. If that
+ * attempt throws an IOException the header is parsed a second time
+ * in old-style mode, where the name fields are read with the
+ * TarUtils.parseName variant that takes no explicit encoding.</p>
+ *
+ * <p>NOTE(review): the encoding is passed through to
+ * TarUtils.parseName, which calls encoding.decode; a null encoding
+ * looks unsafe here - confirm callers never pass null.</p>
+ *
+ * @param headerBuf The header bytes from a tar archive entry.
+ * @param encoding encoding to use for file names
+ * @since Commons Compress 1.4
+ * @throws IllegalArgumentException if any of the numeric fields have an invalid format
+ */
+ public TarArchiveEntry(byte[] headerBuf, ZipEncoding encoding) {
this();
- parseTarHeader(headerBuf);
+ try {
+ parseTarHeader(headerBuf, encoding);
+ } catch (IOException ex) {
+ try {
+ // retry in old-style mode instead of failing construction
+ parseTarHeader(headerBuf, encoding, true);
+ } catch (IOException ex2) {
+ // impossible: in old-style mode the name fields are read with
+ // TarUtils.parseName(byte[], int, int), which declares no IOException
+ throw new RuntimeException(ex2);
+ }
+ }
}
/**
@@ -865,9 +889,39 @@
* @throws IllegalArgumentException if any of the numeric fields have an invalid format
*/
public void parseTarHeader(byte[] header) {
+ try {
+ // first attempt: decode the name fields with the default encoding
+ parseTarHeader(header, TarUtils.DEFAULT_ENCODING);
+ } catch (IOException ex) {
+ try {
+ // second attempt: old-style parsing of the name fields
+ parseTarHeader(header, TarUtils.DEFAULT_ENCODING, true);
+ } catch (IOException ex2) {
+ // not really possible: old-style mode reads names with
+ // TarUtils.parseName(byte[], int, int), which declares no IOException
+ throw new RuntimeException(ex2);
+ }
+ }
+ }
+
+ /**
+ * Parse an entry's header information from a header buffer.
+ *
+ * <p>Delegates to the private three-argument variant with
+ * old-style parsing switched off, so the name fields are decoded
+ * with the given encoding.</p>
+ *
+ * @param header The tar entry header buffer to get information from.
+ * @param encoding encoding to use for file names
+ * @since Commons Compress 1.4
+ * @throws IllegalArgumentException if any of the numeric fields
+ * have an invalid format
+ * @throws IOException propagated from parsing the name fields with
+ * the given encoding
+ */
+ public void parseTarHeader(byte[] header, ZipEncoding encoding)
+ throws IOException {
+ parseTarHeader(header, encoding, false);
+ }
+
+ private void parseTarHeader(byte[] header, ZipEncoding encoding,
+ final boolean oldStyle)
+ throws IOException {
int offset = 0;
- name = TarUtils.parseName(header, offset, NAMELEN);
+ name = oldStyle ? TarUtils.parseName(header, offset, NAMELEN)
+ : TarUtils.parseName(header, offset, NAMELEN, encoding);
offset += NAMELEN;
mode = (int) TarUtils.parseOctalOrBinary(header, offset, MODELEN);
offset += MODELEN;
@@ -881,15 +935,18 @@
offset += MODTIMELEN;
offset += CHKSUMLEN;
linkFlag = header[offset++];
- linkName = TarUtils.parseName(header, offset, NAMELEN);
+ linkName = oldStyle ? TarUtils.parseName(header, offset, NAMELEN)
+ : TarUtils.parseName(header, offset, NAMELEN, encoding);
offset += NAMELEN;
magic = TarUtils.parseName(header, offset, MAGICLEN);
offset += MAGICLEN;
version = TarUtils.parseName(header, offset, VERSIONLEN);
offset += VERSIONLEN;
- userName = TarUtils.parseName(header, offset, UNAMELEN);
+ userName = oldStyle ? TarUtils.parseName(header, offset, UNAMELEN)
+ : TarUtils.parseName(header, offset, UNAMELEN, encoding);
offset += UNAMELEN;
- groupName = TarUtils.parseName(header, offset, GNAMELEN);
+ groupName = oldStyle ? TarUtils.parseName(header, offset, GNAMELEN)
+ : TarUtils.parseName(header, offset, GNAMELEN, encoding);
offset += GNAMELEN;
devMajor = (int) TarUtils.parseOctalOrBinary(header, offset, DEVLEN);
offset += DEVLEN;
@@ -913,7 +970,9 @@
}
case FORMAT_POSIX:
default: {
- String prefix = TarUtils.parseName(header, offset, PREFIXLEN);
+ String prefix = oldStyle
+ ? TarUtils.parseName(header, offset, PREFIXLEN)
+ : TarUtils.parseName(header, offset, PREFIXLEN, encoding);
// SunOS tar -E does not add / to directory names, so fix
// up to be consistent
if (isDirectory() && !name.endsWith("/")){
diff --git a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
index 8837b2b..b2b0ce9 100644
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
@@ -33,6 +33,8 @@
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipEncoding;
+import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
/**
@@ -52,6 +54,7 @@
private byte[] readBuf;
protected final TarBuffer buffer;
private TarArchiveEntry currEntry;
+ private final ZipEncoding encoding;
/**
* Constructor for TarInputStream.
@@ -64,6 +67,16 @@
/**
* Constructor for TarInputStream.
* @param is the input stream to use
+ * @param encoding name of the encoding to use for file names
+ * @since Commons Compress 1.4
+ */
+ public TarArchiveInputStream(InputStream is, String encoding) {
+ // delegate using the default block and record sizes from TarBuffer
+ this(is, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE, encoding);
+ }
+
+ /**
+ * Constructor for TarInputStream.
+ * @param is the input stream to use
* @param blockSize the block size to use
*/
public TarArchiveInputStream(InputStream is, int blockSize) {
@@ -74,12 +87,38 @@
* Constructor for TarInputStream.
* @param is the input stream to use
* @param blockSize the block size to use
+ * @param encoding name of the encoding to use for file names
+ * @since Commons Compress 1.4
+ */
+ public TarArchiveInputStream(InputStream is, int blockSize,
+ String encoding) {
+ // delegate using the default record size from TarBuffer
+ this(is, blockSize, TarBuffer.DEFAULT_RCDSIZE, encoding);
+ }
+
+ /**
+ * Constructor for TarInputStream.
+ * @param is the input stream to use
+ * @param blockSize the block size to use
* @param recordSize the record size to use
*/
public TarArchiveInputStream(InputStream is, int blockSize, int recordSize) {
+ // no encoding name given: pass null and let ZipEncodingHelper.getZipEncoding
+ // pick the encoding (presumably the platform default - confirm)
+ this(is, blockSize, recordSize, null);
+ }
+
+ /**
+ * Constructor for TarInputStream.
+ * @param is the input stream to use
+ * @param blockSize the block size to use
+ * @param recordSize the record size to use
+ * @param encoding name of the encoding to use for file names
+ * @since Commons Compress 1.4
+ */
+ public TarArchiveInputStream(InputStream is, int blockSize, int recordSize,
+ String encoding) {
this.buffer = new TarBuffer(is, blockSize, recordSize);
this.readBuf = null;
this.hasHitEOF = false;
+ // resolve the encoding name once; the resulting ZipEncoding is passed to
+ // each TarArchiveEntry created from a header buffer
+ this.encoding = ZipEncodingHelper.getZipEncoding(encoding);
}
/**
@@ -196,7 +235,7 @@
}
try {
- currEntry = new TarArchiveEntry(headerBuf);
+ currEntry = new TarArchiveEntry(headerBuf, encoding);
} catch (IllegalArgumentException e) {
IOException ioe = new IOException("Error detected parsing the header");
ioe.initCause(e);
diff --git a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveOutputStream.java b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveOutputStream.java
index f53536b..2257988 100644
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveOutputStream.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveOutputStream.java
@@ -26,6 +26,8 @@
import java.util.Map;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveOutputStream;
+import org.apache.commons.compress.archivers.zip.ZipEncoding;
+import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.CountingOutputStream;
@@ -77,6 +79,8 @@
private final OutputStream out;
+ private final ZipEncoding encoding;
+
/**
* Constructor for TarInputStream.
* @param os the output stream to use
@@ -88,6 +92,16 @@
/**
* Constructor for TarInputStream.
* @param os the output stream to use
+ * @param encoding name of the encoding to use for file names
+ * @since Commons Compress 1.4
+ */
+ public TarArchiveOutputStream(OutputStream os, String encoding) {
+ // delegate using the default block and record sizes from TarBuffer
+ this(os, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE, encoding);
+ }
+
+ /**
+ * Constructor for TarArchiveOutputStream.
+ * @param os the output stream to use
* @param blockSize the block size to use
*/
public TarArchiveOutputStream(OutputStream os, int blockSize) {
@@ -98,10 +112,36 @@
* Constructor for TarInputStream.
* @param os the output stream to use
* @param blockSize the block size to use
+ * @param encoding name of the encoding to use for file names
+ * @since Commons Compress 1.4
+ */
+ public TarArchiveOutputStream(OutputStream os, int blockSize,
+ String encoding) {
+ // delegate using the default record size from TarBuffer
+ this(os, blockSize, TarBuffer.DEFAULT_RCDSIZE, encoding);
+ }
+
+ /**
+ * Constructor for TarInputStream.
+ * @param os the output stream to use
+ * @param blockSize the block size to use
* @param recordSize the record size to use
*/
public TarArchiveOutputStream(OutputStream os, int blockSize, int recordSize) {
+ // no encoding name given: pass null and let ZipEncodingHelper.getZipEncoding
+ // pick the encoding (presumably the platform default - confirm)
+ this(os, blockSize, recordSize, null);
+ }
+
+ /**
+ * Constructor for TarArchiveOutputStream.
+ * @param os the output stream to use
+ * @param blockSize the block size to use
+ * @param recordSize the record size to use
+ * @param encoding name of the encoding to use for file names
+ * @since Commons Compress 1.4
+ */
+ public TarArchiveOutputStream(OutputStream os, int blockSize,
+ int recordSize, String encoding) {
out = new CountingOutputStream(os);
+ this.encoding = ZipEncodingHelper.getZipEncoding(encoding);
this.buffer = new TarBuffer(out, blockSize, recordSize);
this.assemLen = 0;
diff --git a/src/main/java/org/apache/commons/compress/archivers/tar/TarUtils.java b/src/main/java/org/apache/commons/compress/archivers/tar/TarUtils.java
index 47f341b..b09dbd1 100644
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarUtils.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarUtils.java
@@ -18,7 +18,11 @@
*/
package org.apache.commons.compress.archivers.tar;
+import java.io.IOException;
import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import org.apache.commons.compress.archivers.zip.ZipEncoding;
+import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
/**
* This class provides static utility methods to work with byte streams.
@@ -30,6 +34,9 @@
private static final int BYTE_MASK = 255;
+ /**
+ * Encoding used by the methods that take no explicit encoding;
+ * obtained by passing a null name to ZipEncodingHelper.getZipEncoding.
+ */
+ static final ZipEncoding DEFAULT_ENCODING =
+ ZipEncodingHelper.getZipEncoding(null);
+
/** Private constructor to prevent instantiation of this utility class. */
private TarUtils(){
}
@@ -211,6 +218,19 @@
* @return The entry name.
*/
public static String parseName(byte[] buffer, final int offset, final int length) {
+ try {
+ // preferred path: decode with the default encoding
+ return parseName(buffer, offset, length, DEFAULT_ENCODING);
+ } catch (IOException ex) {
+ // the default encoding could not handle the name; fall back so that
+ // this overload never throws a checked exception
+ return parseNameFallback(buffer, offset, length);
+ }
+ }
+
+ /*
+ * Used if default encoding cannot encode name and no explicit
+ * encoding has been specified.
+ */
+ private static String parseNameFallback(byte[] buffer, final int offset,
+ final int length) {
StringBuffer result = new StringBuffer(length);
int end = offset + length;
@@ -226,7 +246,38 @@
}
/**
- * Copy a name (StringBuffer) into a buffer.
+ * Parse an entry name from a buffer.
+ * Parsing stops when a NUL is found
+ * or the buffer length is reached.
+ *
+ * @param buffer The buffer from which to parse.
+ * @param offset The offset into the buffer from which to parse.
+ * @param length The maximum number of bytes to parse.
+ * @param encoding the encoding to use for file names
+ * @since Commons Compress 1.4
+ * @return The entry name.
+ */
+ public static String parseName(byte[] buffer, final int offset,
+ final int length,
+ final ZipEncoding encoding)
+ throws IOException {
+
+ // A tar name field is NUL-terminated: the name ends at the first NUL
+ // or at the end of the field, whichever comes first. Merely trimming
+ // trailing NULs would let bytes after an embedded NUL leak into the name.
+ int len = 0;
+ while (len < length && buffer[offset + len] != 0) {
+ len++;
+ }
+ if (len == 0) {
+ // empty field: nothing to decode
+ return "";
+ }
+ // decode only the bytes that belong to the name
+ byte[] b = new byte[len];
+ System.arraycopy(buffer, offset, b, 0, len);
+ return encoding.decode(b);
+ }
+
+ /**
+ * Copy a name into a buffer.
* Copies characters from the name into the buffer
* starting at the specified offset.
* If the buffer is longer than the name, the buffer
@@ -241,6 +292,20 @@
* @return The updated offset, i.e. offset + length
*/
public static int formatNameBytes(String name, byte[] buf, final int offset, final int length) {
+ try {
+ // preferred path: encode with the default encoding
+ return formatNameBytes(name, buf, offset, length, DEFAULT_ENCODING);
+ } catch (IOException ex) {
+ // the default encoding could not handle the name; fall back so that
+ // this overload never throws a checked exception
+ return formatNameBytesFallback(name, buf, offset, length);
+ }
+ }
+
+ /*
+ * Used if default encoding cannot format name and no explicit encoding
+ * has been specified.
+ */
+ private static int formatNameBytesFallback(String name, byte[] buf,
+ final int offset,
+ final int length) {
int i;
// copy until end of input or output is reached.
@@ -257,6 +322,43 @@
}
/**
+ * Copy a name into a buffer, using the given encoding.
+ * Copies characters from the name into the buffer
+ * starting at the specified offset.
+ * If the buffer is longer than the name, the buffer
+ * is filled with trailing NULs.
+ * If the name is longer than the buffer,
+ * the output is truncated.
+ *
+ * @param name The header name from which to copy the characters.
+ * @param buf The buffer where the name is to be stored.
+ * @param offset The starting offset into the buffer
+ * @param length The maximum number of header bytes to copy.
+ * @param encoding the encoding to use for file names
+ * @since Commons Compress 1.4
+ * @return The updated offset, i.e. offset + length
+ */
+ public static int formatNameBytes(String name, byte[] buf, final int offset,
+ final int length,
+ final ZipEncoding encoding)
+ throws IOException {
+ int len = name.length();
+ ByteBuffer b = encoding.encode(name);
+ // Drop trailing characters until the encoded form fits into the field.
+ while (b.remaining() > length && len > 0) {
+ b = encoding.encode(name.substring(0, --len));
+ }
+ // Copy exactly the readable bytes: they start at position() within the
+ // backing array and there are remaining() of them. Using limit() alone
+ // is only correct when the buffer's position happens to be zero.
+ final int limit = b.remaining();
+ System.arraycopy(b.array(), b.arrayOffset() + b.position(), buf, offset, limit);
+
+ // Pad any remaining output bytes with NUL
+ for (int i = limit; i < length; ++i) {
+ buf[offset + i] = 0;
+ }
+
+ return offset + length;
+ }
+
+ /**
* Fill buffer with unsigned octal number, padded with leading zeroes.
*
* @param value number to convert to octal - treated as unsigned
diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncoding.java b/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncoding.java
index b2579c7..65d2044 100644
--- a/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncoding.java
+++ b/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncoding.java
@@ -41,7 +41,7 @@
* <p>All implementations should implement this interface in a
* reentrant way.</p>
*/
-interface ZipEncoding {
+public interface ZipEncoding {
/**
* Check, whether the given string may be losslessly encoded using this
* encoding.
diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java b/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java
index c6b2f3b..de002fa 100644
--- a/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java
+++ b/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java
@@ -27,7 +27,7 @@
/**
* Static helper functions for robustly encoding filenames in zip files.
*/
-abstract class ZipEncodingHelper {
+public abstract class ZipEncodingHelper {
/**
* A class, which holds the high characters of a simple encoding
@@ -207,7 +207,7 @@
* the platform's default encoding.
* @return A zip encoding for the given encoding name.
*/
- static ZipEncoding getZipEncoding(String name) {
+ public static ZipEncoding getZipEncoding(String name) {
// fallback encoding is good enough for utf-8.
if (isUTF8(name)) {
diff --git a/src/test/java/org/apache/commons/compress/archivers/TarTestCase.java b/src/test/java/org/apache/commons/compress/archivers/TarTestCase.java
index dcc2706..7a96dae 100644
--- a/src/test/java/org/apache/commons/compress/archivers/TarTestCase.java
+++ b/src/test/java/org/apache/commons/compress/archivers/TarTestCase.java
@@ -117,7 +117,8 @@
public void testCOMPRESS114() throws Exception {
final File input = getFile("COMPRESS-114.tar");
final InputStream is = new FileInputStream(input);
- final ArchiveInputStream in = new ArchiveStreamFactory().createArchiveInputStream("tar", is);
+ final ArchiveInputStream in = new TarArchiveInputStream(is,
+ "iso-8859-1");
TarArchiveEntry entry = (TarArchiveEntry)in.getNextEntry();
assertEquals("3\u00b1\u00b1\u00b1F06\u00b1W2345\u00b1ZB\u00b1la\u00b1\u00b1\u00b1\u00b1\u00b1\u00b1\u00b1\u00b1BLA", entry.getName());
entry = (TarArchiveEntry)in.getNextEntry();