Add encoding support to DumpArchiveInputStream - related to COMPRESS-180
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/compress/trunk@1512789 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 84ed3b5..d91d24e 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -95,6 +95,10 @@
TarArchiveOutputStream now properly handles link names that
are too long to fit into a traditional TAR header.
</action>
+ <action type="add" date="2013-08-10">
+ DumpArchiveInputStream now supports an encoding parameter that
+ can be used to specify the default encoding of file names.
+ </action>
</release>
<release version="1.5" date="2013-03-14"
description="Release 1.5">
diff --git a/src/main/java/org/apache/commons/compress/archivers/ArchiveStreamFactory.java b/src/main/java/org/apache/commons/compress/archivers/ArchiveStreamFactory.java
index ff8c745..8d7c876 100644
--- a/src/main/java/org/apache/commons/compress/archivers/ArchiveStreamFactory.java
+++ b/src/main/java/org/apache/commons/compress/archivers/ArchiveStreamFactory.java
@@ -116,7 +116,7 @@
private String entryEncoding = null;
/**
- * Returns the encoding to use for arj, zip and tar files,
+ * Returns the encoding to use for arj, zip, dump and tar files,
* or null for the default.
*
* @return entry encoding, or null
@@ -127,8 +127,8 @@
}
/**
- * Sets the encoding to use for arj, zip and tar files.
- * Use null for the default.
+ * Sets the encoding to use for arj, zip, dump and tar files. Use
+ * null for the default.
*
* @since 1.5
*/
@@ -188,7 +188,11 @@
return new CpioArchiveInputStream(in);
}
if (DUMP.equalsIgnoreCase(archiverName)) {
- return new DumpArchiveInputStream(in);
+ if (entryEncoding != null) {
+ return new DumpArchiveInputStream(in, entryEncoding);
+ } else {
+ return new DumpArchiveInputStream(in);
+ }
}
throw new ArchiveException("Archiver: " + archiverName + " not found.");
diff --git a/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveInputStream.java b/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveInputStream.java
index e954a1b..131f977 100644
--- a/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveInputStream.java
+++ b/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveInputStream.java
@@ -20,6 +20,8 @@
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipEncoding;
+import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import java.io.EOFException;
import java.io.IOException;
@@ -39,6 +41,11 @@
* the archive, and the read each entry as a normal input stream
* using read().
*
+ * There doesn't seem to be any hint about the encoding of string
+ * values in any piece of documentation. Given that the main purpose
+ * of dump/restore is backing up a system, it seems very likely that
+ * the format uses the current default encoding of the system.
+ *
* @NotThreadSafe
*/
public class DumpArchiveInputStream extends ArchiveInputStream {
@@ -65,14 +72,34 @@
private Queue<DumpArchiveEntry> queue;
/**
- * Constructor.
+ * The encoding to use for filenames and labels.
+ */
+ private final ZipEncoding encoding;
+
+ /**
+ * Constructor using the platform's default encoding for file
+ * names.
*
* @param is
* @throws ArchiveException
*/
public DumpArchiveInputStream(InputStream is) throws ArchiveException {
+ this(is, null);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param is
+ * @param encoding the encoding to use for file names and the
+ * summary's strings; use null for the platform's default encoding
+ * @since 1.6
+ */
+ public DumpArchiveInputStream(InputStream is, String encoding)
+ throws ArchiveException {
this.raw = new TapeInputStream(is);
this.hasHitEOF = false;
+ this.encoding = ZipEncodingHelper.getZipEncoding(encoding);
try {
// read header, verify it's a dump archive.
@@ -83,7 +110,7 @@
}
// get summary information
- summary = new DumpArchiveSummary(headerBytes);
+ summary = new DumpArchiveSummary(headerBytes, this.encoding);
// reset buffer with actual block size.
raw.resetBlockSize(summary.getNTRec(), summary.isCompressed());
@@ -324,7 +351,7 @@
byte type = blockBuffer[i + 6];
- String name = new String(blockBuffer, i + 8, blockBuffer[i + 7]); // TODO default charset?
+ String name = DumpArchiveUtil.decode(encoding, blockBuffer, i + 8, blockBuffer[i + 7]);
if (".".equals(name) || "..".equals(name)) {
// do nothing...
diff --git a/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveSummary.java b/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveSummary.java
index fca377f..08b9e8f 100644
--- a/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveSummary.java
+++ b/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveSummary.java
@@ -18,8 +18,10 @@
*/
package org.apache.commons.compress.archivers.dump;
+import java.io.IOException;
import java.util.Date;
+import org.apache.commons.compress.archivers.zip.ZipEncoding;
/**
* This class represents identifying information about a Dump archive volume.
@@ -41,15 +43,15 @@
private int firstrec;
private int ntrec;
- DumpArchiveSummary(byte[] buffer) {
+ DumpArchiveSummary(byte[] buffer, ZipEncoding encoding) throws IOException {
dumpDate = 1000L * DumpArchiveUtil.convert32(buffer, 4);
previousDumpDate = 1000L * DumpArchiveUtil.convert32(buffer, 8);
volume = DumpArchiveUtil.convert32(buffer, 12);
- label = new String(buffer, 676, DumpArchiveConstants.LBLSIZE).trim(); // TODO default charset?
+ label = DumpArchiveUtil.decode(encoding, buffer, 676, DumpArchiveConstants.LBLSIZE).trim();
level = DumpArchiveUtil.convert32(buffer, 692);
- filesys = new String(buffer, 696, DumpArchiveConstants.NAMELEN).trim(); // TODO default charset?
- devname = new String(buffer, 760, DumpArchiveConstants.NAMELEN).trim(); // TODO default charset?
- hostname = new String(buffer, 824, DumpArchiveConstants.NAMELEN).trim(); // TODO default charset?
+ filesys = DumpArchiveUtil.decode(encoding, buffer, 696, DumpArchiveConstants.NAMELEN).trim();
+ devname = DumpArchiveUtil.decode(encoding, buffer, 760, DumpArchiveConstants.NAMELEN).trim();
+ hostname = DumpArchiveUtil.decode(encoding, buffer, 824, DumpArchiveConstants.NAMELEN).trim();
flags = DumpArchiveUtil.convert32(buffer, 888);
firstrec = DumpArchiveUtil.convert32(buffer, 892);
ntrec = DumpArchiveUtil.convert32(buffer, 896);
diff --git a/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveUtil.java b/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveUtil.java
index b813d8c..5b74944 100644
--- a/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveUtil.java
+++ b/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveUtil.java
@@ -18,6 +18,8 @@
*/
package org.apache.commons.compress.archivers.dump;
+import java.io.IOException;
+import org.apache.commons.compress.archivers.zip.ZipEncoding;
/**
* Various utilities for dump archives.
@@ -130,4 +132,14 @@
return i;
}
+
+ /**
+ * Decodes the {@code len} bytes of {@code b} starting at {@code offset} with {@code encoding}, copying them out first.
+ */
+ static String decode(ZipEncoding encoding, byte[] b, int offset, int len)
+ throws IOException {
+ byte[] copy = new byte[len];
+ System.arraycopy(b, offset, copy, 0, len);
+ return encoding.decode(copy);
+ }
}
diff --git a/src/main/java/org/apache/commons/compress/archivers/sevenz/SevenZFile.java b/src/main/java/org/apache/commons/compress/archivers/sevenz/SevenZFile.java
index d637ca8..16467e2 100644
--- a/src/main/java/org/apache/commons/compress/archivers/sevenz/SevenZFile.java
+++ b/src/main/java/org/apache/commons/compress/archivers/sevenz/SevenZFile.java
@@ -30,6 +30,7 @@
import org.apache.commons.compress.utils.BoundedInputStream;
import org.apache.commons.compress.utils.CRC32VerifyingInputStream;
+import org.apache.commons.compress.utils.CharsetNames;
/**
* Reads a 7z file, using RandomAccessFile under
@@ -670,7 +671,7 @@
int nextName = 0;
for (int i = 0; i < names.length; i += 2) {
if (names[i] == 0 && names[i+1] == 0) {
- files[nextFile++].setName(new String(names, nextName, i-nextName, "UTF-16LE"));
+ files[nextFile++].setName(new String(names, nextName, i-nextName, CharsetNames.UTF_16LE));
nextName = i + 2;
}
}
diff --git a/src/main/java/org/apache/commons/compress/archivers/tar/TarUtils.java b/src/main/java/org/apache/commons/compress/archivers/tar/TarUtils.java
index 5b5b0ad..24cd26a 100644
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarUtils.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarUtils.java
@@ -239,7 +239,15 @@
// Helper method to generate the exception message
private static String exceptionMessage(byte[] buffer, final int offset,
final int length, int current, final byte currentByte) {
- String string = new String(buffer, offset, length); // TODO default charset?
+ // The default charset is good enough for an exception message.
+ //
+ // The alternative would have been to modify parseOctal and
+ // parseOctalOrBinary to receive the ZipEncoding of the
+ // archive (deprecating the existing public methods, of
+ // course) and to deal with the fact that ZipEncoding#decode
+ // can throw an IOException, which parseOctal* doesn't declare.
+ String string = new String(buffer, offset, length);
+
string=string.replaceAll("\0", "{NUL}"); // Replace NULs to allow string to be printed
final String s = "Invalid byte "+currentByte+" at offset "+(current-offset)+" in '"+string+"' len="+length;
return s;