optionally use PAX headers when writing non-ASCII file names.  COMPRESS-183

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/compress/trunk@1304709 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 5dcf8ca..86ef461 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -46,6 +46,17 @@
   <body>
     <release version="1.4" date="unreleased"
              description="Release 1.4">
+      <action issue="COMPRESS-183" type="fix" date="2012-03-24">
+        The tar package now allows the encoding of file names to be
+        specified and can optionally use PAX extension headers to
+        write non-ASCII file names.
+        The stream classes now write (or expect to read) archives that
+        use the platform's native encoding for file names.  Apache
+        Commons Compress 1.3 used to strip everything but the lower
+        eight bits of each character, which effectively only worked for
+        ASCII and ISO-8859-1 file names.
+        This new default behavior is a breaking change.
+      </action> 
       <action issue="COMPRESS-184" type="fix" date="2012-03-23">
         TarArchiveInputStream failed to parse PAX headers that
         contained non-ASCII characters.
diff --git a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveOutputStream.java b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveOutputStream.java
index 9031f24..2c98d14 100644
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveOutputStream.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveOutputStream.java
@@ -81,6 +81,10 @@
 
     private final ZipEncoding encoding;
 
+    private boolean addPaxHeadersForNonAsciiNames = false;
+    private static final ZipEncoding ASCII =
+        ZipEncodingHelper.getZipEncoding("ASCII");
+
     /**
      * Constructor for TarInputStream.
      * @param os the output stream to use
@@ -172,6 +176,13 @@
         this.bigNumberMode = bigNumberMode;
     }
 
+    /**
+     * Whether to add a PAX "path" header for file names ASCII can't encode.
+     * @since Apache Commons Compress 1.4
+     */
+    public void setAddPaxHeadersForNonAsciiNames(boolean b) {
+        addPaxHeadersForNonAsciiNames = b;
+    }
 
     @Deprecated
     @Override
@@ -254,11 +265,14 @@
         }
         TarArchiveEntry entry = (TarArchiveEntry) archiveEntry;
         Map<String, String> paxHeaders = new HashMap<String, String>();
-        final byte[] nameBytes = encoding.encode(entry.getName()).array();
+        final String entryName = entry.getName();
+        final byte[] nameBytes = encoding.encode(entryName).array();
+        boolean paxHeaderContainsPath = false;
         if (nameBytes.length >= TarConstants.NAMELEN) {
 
             if (longFileMode == LONGFILE_POSIX) {
-                paxHeaders.put("path", entry.getName());
+                paxHeaders.put("path", entryName);
+                paxHeaderContainsPath = true;
             } else if (longFileMode == LONGFILE_GNU) {
                 // create a TarEntry for the LongLink, the contents
                 // of which are the entry's name
@@ -271,7 +285,7 @@
                 write(0); // NUL terminator
                 closeArchiveEntry();
             } else if (longFileMode != LONGFILE_TRUNCATE) {
-                throw new RuntimeException("file name '" + entry.getName()
+                throw new RuntimeException("file name '" + entryName
                                            + "' is too long ( > "
                                            + TarConstants.NAMELEN + " bytes)");
             }
@@ -283,8 +297,13 @@
             failForBigNumbers(entry);
         }
 
+        if (addPaxHeadersForNonAsciiNames && !paxHeaderContainsPath
+            && !ASCII.canEncode(entryName)) {
+            paxHeaders.put("path", entryName);
+        }
+
         if (paxHeaders.size() > 0) {
-            writePaxHeaders(entry.getName(), paxHeaders);
+            writePaxHeaders(entryName, paxHeaders);
         }
 
         entry.writeEntryHeader(recordBuf, encoding,
@@ -298,7 +317,7 @@
         } else {
             currSize = entry.getSize();
         }
-        currName = entry.getName();
+        currName = entryName;
         haveUnclosedEntry = true;
     }
 
@@ -426,7 +445,7 @@
      */
     void writePaxHeaders(String entryName,
                          Map<String, String> headers) throws IOException {
-        String name = "./PaxHeaders.X/" + entryName;
+        String name = "./PaxHeaders.X/" + stripTo7Bits(entryName);
         if (name.length() >= TarConstants.NAMELEN) {
             name = name.substring(0, TarConstants.NAMELEN - 1);
         }
@@ -461,6 +480,18 @@
         closeArchiveEntry();
     }
 
+    /** Masks each char to its low 7 bits, dropping any that become NUL. */
+    private String stripTo7Bits(String name) {
+        StringBuilder result = new StringBuilder(name.length());
+        for (int i = 0; i < name.length(); i++) {
+            char stripped = (char) (name.charAt(i) & 0x7F);
+            if (stripped != 0) { // would be read as a trailing NUL
+                result.append(stripped);
+            }
+        }
+        return result.toString();
+    }
+
     /**
      * Write an EOF (end of archive) record to the tar archive.
      * An EOF record consists of a record of all zeros.
diff --git a/src/test/java/org/apache/commons/compress/archivers/tar/TarArchiveOutputStreamTest.java b/src/test/java/org/apache/commons/compress/archivers/tar/TarArchiveOutputStreamTest.java
index deeb5c3..92e80c3 100644
--- a/src/test/java/org/apache/commons/compress/archivers/tar/TarArchiveOutputStreamTest.java
+++ b/src/test/java/org/apache/commons/compress/archivers/tar/TarArchiveOutputStreamTest.java
@@ -274,4 +274,27 @@
         }
     }
 
+    /** A non-ASCII name must yield a PAX "path" record and be read back. */
+    public void testWriteNonAsciiPathNamePaxHeader() throws Exception {
+        String n = "\u00e4";
+        TarArchiveEntry t = new TarArchiveEntry(n);
+        t.setSize(10 * 1024);
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        TarArchiveOutputStream tos = new TarArchiveOutputStream(bos);
+        tos.setAddPaxHeadersForNonAsciiNames(true);
+        tos.putArchiveEntry(t);
+        tos.write(new byte[10 * 1024]);
+        tos.closeArchiveEntry();
+        tos.close();
+        byte[] data = bos.toByteArray();
+        assertEquals("11 path=" + n + "\n", new String(data, 512, 11, "UTF-8"));
+        TarArchiveInputStream tin =
+            new TarArchiveInputStream(new ByteArrayInputStream(data));
+        try {
+            assertEquals(n, tin.getNextTarEntry().getName());
+        } finally {
+            tin.close();
+        }
+    }
+
 }
\ No newline at end of file