Add encoding support to DumpArchiveInputStream - related to COMPRESS-180 git-svn-id: https://svn.apache.org/repos/asf/commons/proper/compress/trunk@1512789 13f79535-47bb-0310-9956-ffa450edef68

commit: 5972cab660f9f3600ecff59dc94b02a08b0ef1f9 [log] [tgz]
author: Stefan Bodewig <bodewig@apache.org> Sat Aug 10 16:22:49 2013 +0000
committer: Stefan Bodewig <bodewig@apache.org> Sat Aug 10 16:22:49 2013 +0000
tree: dfee781fce4c9d5b5b15d2774523fb9ca6cf0df7
parent: 3f9fb61f14f12464f36a52642af52b761ea959be [diff]
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 84ed3b5..d91d24e 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml

@@ -95,6 +95,10 @@
         TarArchiveOutputStream now properly handles link names that
         are too long to fit into a traditional TAR header.
       </action>
+      <action type="add" date="2013-08-10">
+        DumpArchiveInputStream now supports an encoding parameter that
+        can be used to specify the default encoding of file names.
+      </action>
     </release>
     <release version="1.5" date="2013-03-14"
              description="Release 1.5">

diff --git a/src/main/java/org/apache/commons/compress/archivers/ArchiveStreamFactory.java b/src/main/java/org/apache/commons/compress/archivers/ArchiveStreamFactory.java
index ff8c745..8d7c876 100644
--- a/src/main/java/org/apache/commons/compress/archivers/ArchiveStreamFactory.java
+++ b/src/main/java/org/apache/commons/compress/archivers/ArchiveStreamFactory.java

@@ -116,7 +116,7 @@
     private String entryEncoding = null;
 
     /**
-     * Returns the encoding to use for arj, zip and tar files,
+     * Returns the encoding to use for arj, zip, dump and tar files,
      * or null for the default.
      *
      * @return entry encoding, or null
@@ -127,8 +127,8 @@
     }
 
     /**
-     * Sets the encoding to use for arj, zip and tar files.
-     * Use null for the default.
+     * Sets the encoding to use for arj, zip, dump and tar files.  Use
+     * null for the default.
      *
      * @since 1.5
      */
@@ -188,7 +188,11 @@
             return new CpioArchiveInputStream(in);
         }
         if (DUMP.equalsIgnoreCase(archiverName)) {
-            return new DumpArchiveInputStream(in);
+            if (entryEncoding != null) {
+                return new DumpArchiveInputStream(in, entryEncoding);
+            } else {
+                return new DumpArchiveInputStream(in);
+            }
         }
 
         throw new ArchiveException("Archiver: " + archiverName + " not found.");

diff --git a/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveInputStream.java b/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveInputStream.java
index e954a1b..131f977 100644
--- a/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveInputStream.java
+++ b/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveInputStream.java

@@ -20,6 +20,8 @@
 
 import org.apache.commons.compress.archivers.ArchiveException;
 import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipEncoding;
+import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
 
 import java.io.EOFException;
 import java.io.IOException;
@@ -39,6 +41,11 @@
  * the archive, and the read each entry as a normal input stream
  * using read().
  *
+ * There doesn't seem to exist a hint on the encoding of string values
+ * in any piece of documentation.  Given the main purpose of dump/restore
+ * is backing up a system it seems very likely the format uses the
+ * current default encoding of the system.
+ *
  * @NotThreadSafe
  */
 public class DumpArchiveInputStream extends ArchiveInputStream {
@@ -65,14 +72,34 @@
     private Queue<DumpArchiveEntry> queue;
 
     /**
-     * Constructor.
+     * The encoding to use for filenames and labels.
+     */
+    private final ZipEncoding encoding;
+
+    /**
+     * Constructor using the platform's default encoding for file
+     * names.
      *
      * @param is
      * @throws ArchiveException
      */
     public DumpArchiveInputStream(InputStream is) throws ArchiveException {
+        this(is, null);
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param is
+     * @param encoding the encoding to use for file names, use null
+     * for the platform's default encoding
+     * @since 1.6
+     */
+    public DumpArchiveInputStream(InputStream is, String encoding)
+        throws ArchiveException {
         this.raw = new TapeInputStream(is);
         this.hasHitEOF = false;
+        this.encoding = ZipEncodingHelper.getZipEncoding(encoding);
 
         try {
             // read header, verify it's a dump archive.
@@ -83,7 +110,7 @@
             }
 
             // get summary information
-            summary = new DumpArchiveSummary(headerBytes);
+            summary = new DumpArchiveSummary(headerBytes, this.encoding);
 
             // reset buffer with actual block size.
             raw.resetBlockSize(summary.getNTRec(), summary.isCompressed());
@@ -324,7 +351,7 @@
 
                 byte type = blockBuffer[i + 6];
 
-                String name = new String(blockBuffer, i + 8, blockBuffer[i + 7]); // TODO default charset?
+                String name = DumpArchiveUtil.decode(encoding, blockBuffer, i + 8, blockBuffer[i + 7]);
 
                 if (".".equals(name) || "..".equals(name)) {
                     // do nothing...

diff --git a/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveSummary.java b/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveSummary.java
index fca377f..08b9e8f 100644
--- a/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveSummary.java
+++ b/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveSummary.java

@@ -18,8 +18,10 @@
  */
 package org.apache.commons.compress.archivers.dump;
 
+import java.io.IOException;
 import java.util.Date;
 
+import org.apache.commons.compress.archivers.zip.ZipEncoding;
 
 /**
  * This class represents identifying information about a Dump archive volume.
@@ -41,15 +43,15 @@
     private int firstrec;
     private int ntrec;
 
-    DumpArchiveSummary(byte[] buffer) {
+    DumpArchiveSummary(byte[] buffer, ZipEncoding encoding) throws IOException {
         dumpDate = 1000L * DumpArchiveUtil.convert32(buffer, 4);
         previousDumpDate = 1000L * DumpArchiveUtil.convert32(buffer, 8);
         volume = DumpArchiveUtil.convert32(buffer, 12);
-        label = new String(buffer, 676, DumpArchiveConstants.LBLSIZE).trim(); // TODO default charset?
+        label = DumpArchiveUtil.decode(encoding, buffer, 676, DumpArchiveConstants.LBLSIZE).trim();
         level = DumpArchiveUtil.convert32(buffer, 692);
-        filesys = new String(buffer, 696, DumpArchiveConstants.NAMELEN).trim(); // TODO default charset?
-        devname = new String(buffer, 760, DumpArchiveConstants.NAMELEN).trim(); // TODO default charset?
-        hostname = new String(buffer, 824, DumpArchiveConstants.NAMELEN).trim(); // TODO default charset?
+        filesys = DumpArchiveUtil.decode(encoding, buffer, 696, DumpArchiveConstants.NAMELEN).trim();
+        devname = DumpArchiveUtil.decode(encoding, buffer, 760, DumpArchiveConstants.NAMELEN).trim();
+        hostname = DumpArchiveUtil.decode(encoding, buffer, 824, DumpArchiveConstants.NAMELEN).trim();
         flags = DumpArchiveUtil.convert32(buffer, 888);
         firstrec = DumpArchiveUtil.convert32(buffer, 892);
         ntrec = DumpArchiveUtil.convert32(buffer, 896);

diff --git a/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveUtil.java b/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveUtil.java
index b813d8c..5b74944 100644
--- a/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveUtil.java
+++ b/src/main/java/org/apache/commons/compress/archivers/dump/DumpArchiveUtil.java

@@ -18,6 +18,8 @@
  */
 package org.apache.commons.compress.archivers.dump;
 
+import java.io.IOException;
+import org.apache.commons.compress.archivers.zip.ZipEncoding;
 
 /**
  * Various utilities for dump archives.
@@ -130,4 +132,14 @@
 
         return i;
     }
+
+    /**
+     * Decodes a byte array to a string.
+     */
+    static String decode(ZipEncoding encoding, byte[] b, int offset, int len)
+        throws IOException {
+        byte[] copy = new byte[len];
+        System.arraycopy(b, offset, copy, 0, len);
+        return encoding.decode(copy);
+    }
 }

diff --git a/src/main/java/org/apache/commons/compress/archivers/sevenz/SevenZFile.java b/src/main/java/org/apache/commons/compress/archivers/sevenz/SevenZFile.java
index d637ca8..16467e2 100644
--- a/src/main/java/org/apache/commons/compress/archivers/sevenz/SevenZFile.java
+++ b/src/main/java/org/apache/commons/compress/archivers/sevenz/SevenZFile.java

@@ -30,6 +30,7 @@
 
 import org.apache.commons.compress.utils.BoundedInputStream;
 import org.apache.commons.compress.utils.CRC32VerifyingInputStream;
+import org.apache.commons.compress.utils.CharsetNames;
 
 /**
  * Reads a 7z file, using RandomAccessFile under
@@ -670,7 +671,7 @@
                         int nextName = 0;
                         for (int i = 0; i < names.length; i += 2) {
                             if (names[i] == 0 && names[i+1] == 0) {
-                                files[nextFile++].setName(new String(names, nextName, i-nextName, "UTF-16LE"));
+                                files[nextFile++].setName(new String(names, nextName, i-nextName, CharsetNames.UTF_16LE));
                                 nextName = i + 2;
                             }
                         }

diff --git a/src/main/java/org/apache/commons/compress/archivers/tar/TarUtils.java b/src/main/java/org/apache/commons/compress/archivers/tar/TarUtils.java
index 5b5b0ad..24cd26a 100644
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarUtils.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarUtils.java

@@ -239,7 +239,15 @@
     // Helper method to generate the exception message
     private static String exceptionMessage(byte[] buffer, final int offset,
             final int length, int current, final byte currentByte) {
-        String string = new String(buffer, offset, length); // TODO default charset?
+        // default charset is good enough for an exception message,
+        //
+        // the alternative was to modify parseOctal and
+        // parseOctalOrBinary to receive the ZipEncoding of the
+        // archive (deprecating the existing public methods, of
+        // course) and dealing with the fact that ZipEncoding#decode
+        // can throw an IOException which parseOctal* doesn't declare
+        String string = new String(buffer, offset, length);
+
         string=string.replaceAll("\0", "{NUL}"); // Replace NULs to allow string to be printed
         final String s = "Invalid byte "+currentByte+" at offset "+(current-offset)+" in '"+string+"' len="+length;
         return s;
commit	5972cab660f9f3600ecff59dc94b02a08b0ef1f9	[log] [tgz]
author	Stefan Bodewig <bodewig@apache.org>	Sat Aug 10 16:22:49 2013 +0000
committer	Stefan Bodewig <bodewig@apache.org>	Sat Aug 10 16:22:49 2013 +0000
tree	dfee781fce4c9d5b5b15d2774523fb9ca6cf0df7
parent	3f9fb61f14f12464f36a52642af52b761ea959be [diff]