Issue #24838: tarfile's ustar and gnu formats now correctly calculate name and link field limits for multibyte character encodings like utf-8.

commit: 0f450abec432763b92d6a9b1a778e8c0e5232338 [log] [tgz]
author: Lars Gustäbel <lars@gustaebel.de> Tue Apr 19 08:43:17 2016 +0200
committer: Lars Gustäbel <lars@gustaebel.de> Tue Apr 19 08:43:17 2016 +0200
tree: 992da577543eff31b83c6558ea96bf634db05b40
parent: 472233ec835bfaaf1419c74956a0e64797a6a0c2 [diff] [blame]
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index 523620e..86e1cf9 100755
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py

@@ -812,11 +812,11 @@
         """
         info["magic"] = POSIX_MAGIC
 
-        if len(info["linkname"]) > LENGTH_LINK:
+        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
             raise ValueError("linkname is too long")
 
-        if len(info["name"]) > LENGTH_NAME:
-            info["prefix"], info["name"] = self._posix_split_name(info["name"])
+        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
+            info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors)
 
         return self._create_header(info, USTAR_FORMAT, encoding, errors)
 
@@ -826,10 +826,10 @@
         info["magic"] = GNU_MAGIC
 
         buf = b""
-        if len(info["linkname"]) > LENGTH_LINK:
+        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
             buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)
 
-        if len(info["name"]) > LENGTH_NAME:
+        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
             buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)
 
         return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
@@ -889,19 +889,20 @@
         """
         return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
 
-    def _posix_split_name(self, name):
+    def _posix_split_name(self, name, encoding, errors):
         """Split a name longer than 100 chars into a prefix
            and a name part.
         """
-        prefix = name[:LENGTH_PREFIX + 1]
-        while prefix and prefix[-1] != "/":
-            prefix = prefix[:-1]
-
-        name = name[len(prefix):]
-        prefix = prefix[:-1]
-
-        if not prefix or len(name) > LENGTH_NAME:
+        components = name.split("/")
+        for i in range(1, len(components)):
+            prefix = "/".join(components[:i])
+            name = "/".join(components[i:])
+            if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \
+                    len(name.encode(encoding, errors)) <= LENGTH_NAME:
+                break
+        else:
             raise ValueError("name is too long")
+
         return prefix, name
 
     @staticmethod
commit	0f450abec432763b92d6a9b1a778e8c0e5232338	[log] [tgz]
author	Lars Gustäbel <lars@gustaebel.de>	Tue Apr 19 08:43:17 2016 +0200
committer	Lars Gustäbel <lars@gustaebel.de>	Tue Apr 19 08:43:17 2016 +0200
tree	992da577543eff31b83c6558ea96bf634db05b40
parent	472233ec835bfaaf1419c74956a0e64797a6a0c2 [diff] [blame]