Added errors argument to TarFile class that allows the user to
specify an error handling scheme for character conversion. An additional
scheme "utf-8" is available in read mode. Unicode input filenames are now
supported by design. The values of the pax_headers dictionary are now
limited to unicode objects.

Fixed: The prefix field is no longer used in PAX_FORMAT (in
conformance with POSIX).
Fixed: In read mode, use the size field from a pax header if present.
Fixed: Strip trailing slashes from pax header name values.
Fixed: Give values in user-specified pax_headers precedence when
writing.

Added unicode tests. Added pax/regtype4 member to testtar.tar with all
possible number fields in a pax header.

Added two chapters to the documentation about the different formats
tarfile.py supports and how unicode issues are handled.
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index 4f4a1d9..107041e 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -125,6 +125,17 @@
 PAX_FIELDS = ("path", "linkpath", "size", "mtime",
               "uid", "gid", "uname", "gname")
 
+# Fields in a pax header that are numbers, all other fields
+# are treated as strings.
+PAX_NUMBER_FIELDS = {
+    "atime": float,
+    "ctime": float,
+    "mtime": float,
+    "uid": int,
+    "gid": int,
+    "size": int
+}
+
 #---------------------------------------------------------
 # Bits used in the mode field, values in octal.
 #---------------------------------------------------------
@@ -154,7 +165,7 @@
 #---------------------------------------------------------
 ENCODING = sys.getfilesystemencoding()
 if ENCODING is None:
-    ENCODING = "ascii"
+    ENCODING = sys.getdefaultencoding()
 
 #---------------------------------------------------------
 # Some useful functions
@@ -218,6 +229,26 @@
         s = chr(0200) + s
     return s
 
+def uts(s, encoding, errors):
+    """Convert a unicode object to a string.
+    """
+    if errors == "utf-8":
+        # An extra error handler similar to the -o invalid=UTF-8 option
+        # in POSIX.1-2001. Replace untranslatable characters with their
+        # UTF-8 representation.
+        try:
+            return s.encode(encoding, "strict")
+        except UnicodeEncodeError:
+            x = []
+            for c in s:
+                try:
+                    x.append(c.encode(encoding, "strict"))
+                except UnicodeEncodeError:
+                    x.append(c.encode("utf8"))
+            return "".join(x)
+    else:
+        return s.encode(encoding, errors)
+
 def calc_chksums(buf):
     """Calculate the checksum for a member's header by summing up all
        characters except for the chksum field which is treated as if
@@ -922,7 +953,7 @@
     def __repr__(self):
         return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
 
-    def get_info(self):
+    def get_info(self, encoding, errors):
         """Return the TarInfo's attributes as a dictionary.
         """
         info = {
@@ -944,24 +975,29 @@
         if info["type"] == DIRTYPE and not info["name"].endswith("/"):
             info["name"] += "/"
 
+        for key in ("name", "linkname", "uname", "gname"):
+            if type(info[key]) is unicode:
+                info[key] = info[key].encode(encoding, errors)
+
         return info
 
-    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING):
+    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"):
         """Return a tar header as a string of 512 byte blocks.
         """
+        info = self.get_info(encoding, errors)
+
         if format == USTAR_FORMAT:
-            return self.create_ustar_header()
+            return self.create_ustar_header(info)
         elif format == GNU_FORMAT:
-            return self.create_gnu_header()
+            return self.create_gnu_header(info)
         elif format == PAX_FORMAT:
-            return self.create_pax_header(encoding)
+            return self.create_pax_header(info, encoding, errors)
         else:
             raise ValueError("invalid format")
 
-    def create_ustar_header(self):
+    def create_ustar_header(self, info):
         """Return the object as a ustar header block.
         """
-        info = self.get_info()
         info["magic"] = POSIX_MAGIC
 
         if len(info["linkname"]) > LENGTH_LINK:
@@ -972,10 +1008,9 @@
 
         return self._create_header(info, USTAR_FORMAT)
 
-    def create_gnu_header(self):
+    def create_gnu_header(self, info):
         """Return the object as a GNU header block sequence.
         """
-        info = self.get_info()
         info["magic"] = GNU_MAGIC
 
         buf = ""
@@ -987,12 +1022,11 @@
 
         return buf + self._create_header(info, GNU_FORMAT)
 
-    def create_pax_header(self, encoding):
+    def create_pax_header(self, info, encoding, errors):
         """Return the object as a ustar header block. If it cannot be
            represented this way, prepend a pax extended header sequence
            with supplement information.
         """
-        info = self.get_info()
         info["magic"] = POSIX_MAGIC
         pax_headers = self.pax_headers.copy()
 
@@ -1002,7 +1036,11 @@
                 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
                 ("uname", "uname", 32), ("gname", "gname", 32)):
 
-            val = info[name].decode(encoding)
+            if hname in pax_headers:
+                # The pax header has priority.
+                continue
+
+            val = info[name].decode(encoding, errors)
 
             # Try to encode the string as ASCII.
             try:
@@ -1011,27 +1049,23 @@
                 pax_headers[hname] = val
                 continue
 
-            if len(val) > length:
-                if name == "name":
-                    # Try to squeeze a longname in the prefix and name fields as in
-                    # ustar format.
-                    try:
-                        info["prefix"], info["name"] = self._posix_split_name(info["name"])
-                    except ValueError:
-                        pax_headers[hname] = val
-                    else:
-                        continue
-                else:
-                    pax_headers[hname] = val
+            if len(info[name]) > length:
+                pax_headers[hname] = val
 
         # Test number fields for values that exceed the field limit or values
         # that like to be stored as float.
         for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
+            if name in pax_headers:
+                # The pax header has priority. Avoid overflow.
+                info[name] = 0
+                continue
+
             val = info[name]
             if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                 pax_headers[name] = unicode(val)
                 info[name] = 0
 
+        # Create a pax extended header if necessary.
         if pax_headers:
             buf = self._create_pax_generic_header(pax_headers)
         else:
@@ -1040,26 +1074,10 @@
         return buf + self._create_header(info, USTAR_FORMAT)
 
     @classmethod
-    def create_pax_global_header(cls, pax_headers, encoding):
+    def create_pax_global_header(cls, pax_headers):
         """Return the object as a pax global header block sequence.
         """
-        new_headers = {}
-        for key, val in pax_headers.iteritems():
-            key = cls._to_unicode(key, encoding)
-            val = cls._to_unicode(val, encoding)
-            new_headers[key] = val
-        return cls._create_pax_generic_header(new_headers, type=XGLTYPE)
-
-    @staticmethod
-    def _to_unicode(value, encoding):
-        if isinstance(value, unicode):
-            return value
-        elif isinstance(value, (int, long, float)):
-            return unicode(value)
-        elif isinstance(value, str):
-            return unicode(value, encoding)
-        else:
-            raise ValueError("unable to convert to unicode: %r" % value)
+        return cls._create_pax_generic_header(pax_headers, type=XGLTYPE)
 
     def _posix_split_name(self, name):
         """Split a name longer than 100 chars into a prefix
@@ -1091,9 +1109,9 @@
             "        ", # checksum field
             info.get("type", REGTYPE),
             stn(info.get("linkname", ""), 100),
-            stn(info.get("magic", ""), 8),
-            stn(info.get("uname", ""), 32),
-            stn(info.get("gname", ""), 32),
+            stn(info.get("magic", POSIX_MAGIC), 8),
+            stn(info.get("uname", "root"), 32),
+            stn(info.get("gname", "root"), 32),
             itn(info.get("devmajor", 0), 8, format),
             itn(info.get("devminor", 0), 8, format),
             stn(info.get("prefix", ""), 155)
@@ -1254,12 +1272,9 @@
             offset += self._block(self.size)
         tarfile.offset = offset
 
-        # Patch the TarInfo object with saved extended
+        # Patch the TarInfo object with saved global
         # header information.
-        for keyword, value in tarfile.pax_headers.iteritems():
-            if keyword in PAX_FIELDS:
-                setattr(self, keyword, value)
-            self.pax_headers[keyword] = value
+        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
 
         return self
 
@@ -1270,18 +1285,17 @@
         buf = tarfile.fileobj.read(self._block(self.size))
 
         # Fetch the next header and process it.
-        b = tarfile.fileobj.read(BLOCKSIZE)
-        t = self.frombuf(b)
-        t.offset = self.offset
-        next = t._proc_member(tarfile)
+        next = self.fromtarfile(tarfile)
+        if next is None:
+            raise HeaderError("missing subsequent header")
 
         # Patch the TarInfo object from the next header with
         # the longname information.
         next.offset = self.offset
         if self.type == GNUTYPE_LONGNAME:
-            next.name = buf.rstrip(NUL)
+            next.name = nts(buf)
         elif self.type == GNUTYPE_LONGLINK:
-            next.linkname = buf.rstrip(NUL)
+            next.linkname = nts(buf)
 
         return next
 
@@ -1356,21 +1370,10 @@
         else:
             pax_headers = tarfile.pax_headers.copy()
 
-        # Fields in POSIX.1-2001 that are numbers, all other fields
-        # are treated as UTF-8 strings.
-        type_mapping = {
-            "atime":        float,
-            "ctime":        float,
-            "mtime":        float,
-            "uid":          int,
-            "gid":          int,
-            "size":         int
-        }
-
         # Parse pax header information. A record looks like that:
         # "%d %s=%s\n" % (length, keyword, value). length is the size
         # of the complete record including the length field itself and
-        # the newline.
+        # the newline. keyword and value are both UTF-8 encoded strings.
         regex = re.compile(r"(\d+) ([^=]+)=", re.U)
         pos = 0
         while True:
@@ -1383,35 +1386,55 @@
             value = buf[match.end(2) + 1:match.start(1) + length - 1]
 
             keyword = keyword.decode("utf8")
-            keyword = keyword.encode(tarfile.encoding)
-
             value = value.decode("utf8")
-            if keyword in type_mapping:
-                try:
-                    value = type_mapping[keyword](value)
-                except ValueError:
-                    value = 0
-            else:
-                value = value.encode(tarfile.encoding)
 
             pax_headers[keyword] = value
             pos += length
 
-        # Fetch the next header that will be patched with the
-        # supplement information from the pax header (extended
-        # only).
-        t = self.fromtarfile(tarfile)
+        # Fetch the next header.
+        next = self.fromtarfile(tarfile)
 
-        if self.type != XGLTYPE and t is not None:
-            # Patch the TarInfo object from the next header with
-            # the pax header's information.
-            for keyword, value in pax_headers.items():
-                if keyword in PAX_FIELDS:
-                    setattr(t, keyword, value)
-                pax_headers[keyword] = value
-            t.pax_headers = pax_headers.copy()
+        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
+            if next is None:
+                raise HeaderError("missing subsequent header")
 
-        return t
+            # Patch the TarInfo object with the extended header info.
+            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
+            next.offset = self.offset
+
+            if pax_headers.has_key("size"):
+                # If the extended header replaces the size field,
+                # we need to recalculate the offset where the next
+                # header starts.
+                offset = next.offset_data
+                if next.isreg() or next.type not in SUPPORTED_TYPES:
+                    offset += next._block(next.size)
+                tarfile.offset = offset
+
+        return next
+
+    def _apply_pax_info(self, pax_headers, encoding, errors):
+        """Replace fields with supplemental information from a previous
+           pax extended or global header.
+        """
+        for keyword, value in pax_headers.iteritems():
+            if keyword not in PAX_FIELDS:
+                continue
+
+            if keyword == "path":
+                value = value.rstrip("/")
+
+            if keyword in PAX_NUMBER_FIELDS:
+                try:
+                    value = PAX_NUMBER_FIELDS[keyword](value)
+                except ValueError:
+                    value = 0
+            else:
+                value = uts(value, encoding, errors)
+
+            setattr(self, keyword, value)
+
+        self.pax_headers = pax_headers.copy()
 
     def _block(self, count):
         """Round up a byte count by BLOCKSIZE and return it,
@@ -1462,8 +1485,9 @@
 
     format = DEFAULT_FORMAT     # The format to use when creating an archive.
 
-    encoding = ENCODING         # Transfer UTF-8 strings from POSIX.1-2001
-                                # headers to this encoding.
+    encoding = ENCODING         # Encoding for 8-bit character strings.
+
+    errors = None               # Error handler for unicode conversion.
 
     tarinfo = TarInfo           # The default TarInfo class to use.
 
@@ -1471,7 +1495,7 @@
 
     def __init__(self, name=None, mode="r", fileobj=None, format=None,
             tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
-            pax_headers=None, debug=None, errorlevel=None):
+            errors=None, pax_headers=None, debug=None, errorlevel=None):
         """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
            read from an existing archive, 'a' to append data to an existing
            file or 'w' to create a new file overwriting an existing one. `mode'
@@ -1512,6 +1536,19 @@
             self.ignore_zeros = ignore_zeros
         if encoding is not None:
             self.encoding = encoding
+
+        if errors is not None:
+            self.errors = errors
+        elif mode == "r":
+            self.errors = "utf-8"
+        else:
+            self.errors = "strict"
+
+        if pax_headers is not None and self.format == PAX_FORMAT:
+            self.pax_headers = pax_headers
+        else:
+            self.pax_headers = {}
+
         if debug is not None:
             self.debug = debug
         if errorlevel is not None:
@@ -1524,7 +1561,6 @@
         self.offset = 0L        # current position in the archive file
         self.inodes = {}        # dictionary caching the inodes of
                                 # archive members already added
-        self.pax_headers = {}   # save contents of global pax headers
 
         if self.mode == "r":
             self.firstmember = None
@@ -1543,9 +1579,8 @@
         if self.mode in "aw":
             self._loaded = True
 
-            if pax_headers:
-                buf = self.tarinfo.create_pax_global_header(
-                        pax_headers.copy(), self.encoding)
+            if self.pax_headers:
+                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                 self.fileobj.write(buf)
                 self.offset += len(buf)
 
@@ -1817,8 +1852,6 @@
                     self.inodes[inode] = arcname
         elif stat.S_ISDIR(stmd):
             type = DIRTYPE
-            if arcname[-1:] != "/":
-                arcname += "/"
         elif stat.S_ISFIFO(stmd):
             type = FIFOTYPE
         elif stat.S_ISLNK(stmd):
@@ -1952,7 +1985,7 @@
 
         tarinfo = copy.copy(tarinfo)
 
-        buf = tarinfo.tobuf(self.format, self.encoding)
+        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
         self.fileobj.write(buf)
         self.offset += len(buf)