#!/usr/bin/env python3
#-------------------------------------------------------------------
# tarfile.py
#-------------------------------------------------------------------
# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
# All rights reserved.
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
29"""Read from and write to tar format archives.
30"""
31
# Module metadata.  Non-ASCII characters in the names are written as
# \u escapes so the source itself stays pure ASCII.
version = "0.9.0"
__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000037
#---------
# Imports
#---------
Serhiy Storchakacf4a2f22015-03-11 17:18:03 +020041from builtins import open as bltn_open
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000042import sys
43import os
Eli Bendersky74c503b2012-01-03 06:26:13 +020044import io
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000045import shutil
46import stat
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000047import time
48import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000049import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000050import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000051
52try:
53 import grp, pwd
Brett Cannoncd171c82013-07-04 17:43:24 -040054except ImportError:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000055 grp = pwd = None
56
# Exceptions that signal "a symbolic link cannot be created here":
#   * AttributeError / NotImplementedError -- os.symlink on Windows prior
#     to 6.0 raises NotImplementedError.
#   * OSError (winerror=1314) -- raised if the caller does not hold the
#     SeCreateSymbolicLinkPrivilege privilege.
# OSError is a builtin, so it can be listed directly; no NameError guard
# is needed around it.
symlink_exception = (AttributeError, NotImplementedError, OSError)
65
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000066# from tarfile import *
67__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
68
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records

# Both magic strings cover the 6-byte "magic" plus the 2-byte "version"
# header fields, i.e. 8 bytes in total.
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string ("ustar", 2 spaces, NUL)
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string ("ustar", NUL, "00")

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000104
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, GNUTYPE_SPARSE)

# File types that tarfile supports: the standard member types followed
# by the GNU extension types.
SUPPORTED_TYPES = (
    REGTYPE, AREGTYPE, LNKTYPE,
    SYMTYPE, DIRTYPE, FIFOTYPE,
    CONTTYPE, CHRTYPE, BLKTYPE,
) + GNU_TYPES

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE, CONTTYPE, GNUTYPE_SPARSE)
122
# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = (
    "path", "linkpath", "size", "mtime",
    "uid", "gid", "uname", "gname",
)

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Maps the field name to its converter.
PAX_NUMBER_FIELDS = dict(
    atime=float,
    ctime=float,
    mtime=float,
    uid=int,
    gid=int,
    size=int,
)
140
#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# Module-wide default encoding: "utf-8" when os.name is "nt" or "ce",
# otherwise whatever sys.getfilesystemencoding() reports.
ENCODING = "utf-8" if os.name in ("nt", "ce") else sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000148
#---------------------------------------------------------
# Some useful functions
#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000152
def stn(s, length, encoding, errors):
    """Convert a string to a null-terminated bytes object.

       s is encoded with the given encoding and error handler, cut off
       after length bytes and, when shorter than length, filled up with
       NUL bytes.
    """
    encoded = s.encode(encoding, errors)
    # A negative repeat count yields b"", so over-long input gets no padding.
    padding = NUL * (length - len(encoded))
    return encoded[:length] + padding
Thomas Wouters477c8d52006-05-27 19:21:47 +0000158
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.

       Only the bytes in front of the first NUL byte are decoded; if s
       contains no NUL byte all of it is decoded.
    """
    head, _sep, _tail = s.partition(b"\0")
    return head.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000166
def nti(s):
    """Convert a number field to a python number.

       s is the raw bytes of a header number field.  Two encodings are
       recognized (see itn()):

       * a leading 0o200 or 0o377 byte followed by a big-endian base-256
         number (0o200: positive, 0o377: negative);
       * otherwise an ASCII octal string, NUL-terminated and optionally
         surrounded by whitespace.  An empty field counts as 0.

       Raises InvalidHeaderError if the octal form cannot be parsed.
    """
    marker = s[0]
    if marker in (0o200, 0o377):
        # Base-256: everything after the marker byte is the magnitude.
        n = int.from_bytes(s[1:], "big")
        if marker == 0o377:
            # Negative numbers are stored as 256**(len-1) + value.
            n -= 256 ** (len(s) - 1)
        return n

    try:
        text = nts(s, "ascii", "strict")
        return int(text.strip() or "0", 8)
    except ValueError:
        # Also covers UnicodeDecodeError (a ValueError subclass) from nts().
        raise InvalidHeaderError("invalid header")
186
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.

       n      -- the number to store; non-integral values (e.g. a float
                 mtime) are truncated with int().
       digits -- total width of the field in bytes.
       format -- only GNU_FORMAT permits the base-256 encoding.

       Returns the field as bytes (octal form) or bytearray (base-256
       form).  Raises ValueError if n does not fit into the field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.

    # Coerce once up front: the base-256 branch needs a real integer, a
    # float would otherwise fail there with a TypeError.
    n = int(n)
    payload_len = digits - 1

    if 0 <= n < 8 ** payload_len:
        s = bytes("%0*o" % (payload_len, n), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** payload_len <= n < 256 ** payload_len:
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            # Store negative values as their offset from 256**payload_len.
            n += 256 ** payload_len
        s += n.to_bytes(payload_len, "big")
    else:
        raise ValueError("overflow in number field")

    return s
214
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.

       Returns the tuple (unsigned_chksum, signed_chksum).
    """
    # 148 bytes in front of the chksum field, 8 skipped bytes for the
    # field itself, 356 bytes behind it; {0} selects the char signedness.
    layout = "148{0}8x356{0}"
    unsigned_values = struct.unpack_from(layout.format("B"), buf)
    signed_values = struct.unpack_from(layout.format("b"), buf)
    # 256 is the contribution of the 8 skipped bytes counted as spaces.
    return 256 + sum(unsigned_values), 256 + sum(signed_values)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000227
def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.

       exception -- exception class raised with the message
                    "unexpected end of data" when src runs dry before
                    length bytes were copied.
       bufsize   -- size of the chunks to copy in; None (or 0) selects
                    the default of 16 KiB.
    """
    if length == 0:
        return

    if length is None:
        if bufsize:
            shutil.copyfileobj(src, dst, bufsize)
        else:
            # No explicit size requested: keep shutil's own default.
            shutil.copyfileobj(src, dst)
        return

    bufsize = bufsize or 16 * 1024
    blocks, remainder = divmod(length, bufsize)

    # Full-sized chunks first ...
    for _ in range(blocks):
        buf = src.read(bufsize)
        if len(buf) < bufsize:
            raise exception("unexpected end of data")
        dst.write(buf)

    # ... then whatever is left over.
    if remainder != 0:
        buf = src.read(remainder)
        if len(buf) < remainder:
            raise exception("unexpected end of data")
        dst.write(buf)
252
def filemode(mode):
    """Deprecated in this location; use stat.filemode."""
    import warnings

    # stacklevel=2 attributes the warning to the caller of filemode().
    warnings.warn(
        "deprecated in favor of stat.filemode",
        category=DeprecationWarning,
        stacklevel=2,
    )
    return stat.filemode(mode)
259
def _safe_print(s):
    """Print s to stdout, followed by a space instead of a newline.

       If sys.stdout reports an encoding, s is first round-tripped
       through it with the 'backslashreplace' error handler, so that
       characters the encoding cannot represent are printed as escapes.
    """
    stdout_encoding = getattr(sys.stdout, 'encoding', None)
    if stdout_encoding is None:
        printable = s
    else:
        raw = s.encode(stdout_encoding, 'backslashreplace')
        printable = raw.decode(stdout_encoding)
    print(printable, end=' ')
265
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000266
# Exception hierarchy: everything derives from TarError; all header
# parsing problems share the HeaderError base class.

class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000300
#---------------------------
# internal stream interface
#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.

       The file is accessed through a raw os-level file descriptor
       (self.fd).
    """

    def __init__(self, name, mode):
        """Open the file name; mode must be "r" or "w" (any other value
           raises KeyError).
        """
        flags = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        # O_BINARY is only defined on some platforms; add it when present.
        flags |= getattr(os, "O_BINARY", 0)
        self.fd = os.open(name, flags, 0o666)

    def close(self):
        """Close the underlying file descriptor."""
        os.close(self.fd)

    def read(self, size):
        """Return up to size bytes read from the file descriptor."""
        return os.read(self.fd, size)

    def write(self, s):
        """Write all of the bytes-like object s to the file descriptor."""
        # os.write() returns how many bytes it actually wrote, which may
        # be fewer than requested; keep going until nothing is left.
        view = memoryview(s).cast("B")
        offset = 0
        while offset < len(view):
            offset += os.write(self.fd, view[offset:])
327
class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip, bzip2 or xz compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.

           name     -- file name; opened as a _LowLevelFile when fileobj
                       is None, and used for the gzip FNAME field.
           mode     -- "r" selects reading, otherwise the stream is set
                       up for writing.
           comptype -- "tar" (no compression), "gz", "bz2", "xz", or "*"
                       to detect the compression from the data.
           fileobj  -- stream-like object to use, or None.
           bufsize  -- size of the blocks exchanged with fileobj.

           Raises CompressionError for an unknown comptype or when the
           required compression module cannot be imported.
        """
        # Only a file opened here is closed again by close().
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""          # raw bytes not yet consumed / written
        self.pos = 0            # position in the uncompressed stream
        self.closed = False

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                if mode == "r":
                    self._init_read_gz()
                    self.exception = zlib.error
                else:
                    self._init_write_gz()

            elif comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == "xz":
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype != "tar":
                raise CompressionError("unknown compression type %r" % comptype)

        except BaseException:
            # Do not leak a file opened above, then pass the error on.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # "closed" is missing if __init__ failed before setting it.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        # Negative wbits: raw deflate data, the gzip header and trailer
        # are written by this class itself.
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)
        timestamp = struct.pack("<L", int(time.time()))
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Write string s to the stream if a whole new block
           is ready to be written.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        self.closed = True
        try:
            if self.mode == "w" and self.comptype != "tar":
                self.buf += self.cmp.flush()

            if self.mode == "w" and self.buf:
                self.fileobj.write(self.buf)
                self.buf = b""
                if self.comptype == "gz":
                    # gzip trailer: CRC32 and size of the uncompressed data.
                    self.fileobj.write(struct.pack("<L", self.crc))
                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
        finally:
            if not self._extfileobj:
                self.fileobj.close()

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)

        if flag & 4:
            # Skip the extra field.  It is part of the raw gzip header,
            # so it must bypass the decompressor and must not count
            # towards self.pos -- hence __read(), not read().
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.__read(xlen)
        if flag & 8:
            # Skip the NUL-terminated original file name.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            # Skip the NUL-terminated comment.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            # Skip the 2-byte header checksum.
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.
        """
        if pos - self.pos >= 0:
            # Seeking forward means reading and discarding the data.
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            # The chunks are bytes, so they must be joined with bytes.
            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream.
        """
        if self.comptype == "tar":
            return self.__read(size)

        # Decompress raw blocks until enough data is buffered or the
        # underlying stream is exhausted.
        c = len(self.dbuf)
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception:
                raise ReadError("invalid compressed data")
            self.dbuf += buf
            c += len(buf)
        buf = self.dbuf[:size]
        self.dbuf = self.dbuf[size:]
        return buf

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            self.buf += buf
            c += len(buf)
        buf = self.buf[:size]
        self.buf = self.buf[size:]
        return buf
# class _Stream
571
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').

       The first BLOCKSIZE bytes of the wrapped file object are read
       ahead so that their signature can be inspected.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        # One-shot method: the first call hands out the block that was
        # read ahead (regardless of size) and rebinds read() on the
        # instance, so later calls go straight to the wrapped object.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Return "gz", "bz2", "xz" or "tar" depending on the signature
           found at the start of the data.
        """
        head = self.buf
        if head.startswith(b"\x1f\x8b\x08"):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy
598
#------------------------
# Extraction file object
#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        """fileobj   -- the underlying file object.
           offset    -- position in fileobj where the member's data starts.
           size      -- size of the file presented by this object.
           blockinfo -- optional list of (offset, size) pairs naming the
                        parts of the presented file that are stored in
                        fileobj; everything in between reads as NUL
                        bytes.  None means the data is stored in one
                        piece.
        """
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.  Each entry is
        # (is_data, start, stop, position_in_fileobj_or_None).
        self.map_index = 0
        self.map = []
        covered = 0
        stored_at = self.offset
        for block_start, block_len in blockinfo:
            if block_start > covered:
                self.map.append((False, covered, block_start, None))
            block_stop = block_start + block_len
            self.map.append((True, block_start, block_stop, stored_at))
            stored_at += block_len
            covered = block_stop
        if covered < self.size:
            self.map.append((False, covered, self.size, None))

    def flush(self):
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.

           The resulting position is limited to the range 0..size and
           returned.  An unknown whence raises ValueError.
        """
        if whence == io.SEEK_SET:
            target = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            moved = self.position + position
            target = max(moved, 0) if position < 0 else min(moved, self.size)
        elif whence == io.SEEK_END:
            target = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        self.position = target
        return self.position

    def read(self, size=None):
        """Read data from the file.

           Returns at most size bytes (everything up to the end if size
           is None).  Raises ReadError if the underlying file object
           delivers less data than expected.
        """
        available = self.size - self.position
        size = available if size is None else min(size, available)

        pieces = []
        while size > 0:
            # Find the map entry the current position falls into,
            # starting at the entry used last and wrapping around.
            while True:
                is_data, start, stop, stored_at = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                self.map_index = (self.map_index + 1) % len(self.map)

            length = min(size, stop - self.position)
            if is_data:
                self.fileobj.seek(stored_at + (self.position - start))
                piece = self.fileobj.read(length)
                if len(piece) != length:
                    raise ReadError("unexpected end of data")
                pieces.append(piece)
            else:
                # A hole: it reads as NUL bytes.
                pieces.append(NUL * length)
            size -= length
            self.position += length
        return b"".join(pieces)

    def readinto(self, b):
        """Read into the writable buffer b; return the number of bytes
           stored.
        """
        data = self.read(len(b))
        count = len(data)
        b[:count] = data
        return count

    def close(self):
        self.closed = True
#class _FileInFile
Martin v. Löwisdf241532005-03-03 08:17:42 +0000705
class ExFileObject(io.BufferedReader):
    """Buffered reader for the data of a single archive member.

       The raw stream underneath is a _FileInFile built from the
       archive's file object and the member's offset_data, size and
       sparse attributes.
    """

    def __init__(self, tarfile, tarinfo):
        member_data = _FileInFile(
            tarfile.fileobj,
            tarinfo.offset_data,
            tarinfo.size,
            tarinfo.sparse,
        )
        super().__init__(member_data)
#class ExFileObject
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000713
#------------------
# Exported Classes
#------------------
717class TarInfo(object):
718 """Informational class which holds the details about an
719 archive member given by a tar header block.
720 TarInfo objects are returned by TarFile.getmember(),
721 TarFile.getmembers() and TarFile.gettarinfo() and are
722 usually created internally.
723 """
724
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000725 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
726 "chksum", "type", "linkname", "uname", "gname",
727 "devmajor", "devminor",
728 "offset", "offset_data", "pax_headers", "sparse",
729 "tarfile", "_sparse_structs", "_link_target")
730
def __init__(self, name=""):
    """Set up a TarInfo object with default header values.
       name, if given, is used as the member name.
    """
    # Naming and member type.
    self.name = name            # member name
    self.linkname = ""          # link name
    self.type = REGTYPE         # member type

    # Permissions and ownership.
    self.mode = 0o644           # file permissions
    self.uid = 0                # user id
    self.gid = 0                # group id
    self.uname = ""             # user name
    self.gname = ""             # group name

    # Size, time stamp and checksum.
    self.size = 0               # file size
    self.mtime = 0              # modification time
    self.chksum = 0             # header checksum

    # Device numbers.
    self.devmajor = 0           # device major number
    self.devminor = 0           # device minor number

    # Positions inside the archive.
    self.offset = 0             # the tar header starts here
    self.offset_data = 0        # the file's data starts here

    # Extension data.
    self.sparse = None          # sparse member information
    self.pax_headers = {}       # pax header information
754
# pax headers use the keywords "path" and "linkpath" for what the
# tar header calls "name" and "linkname"; expose both as aliases.
def _getpath(self):
    """Return the member name."""
    return self.name

def _setpath(self, name):
    """Set the member name."""
    self.name = name

path = property(fget=_getpath, fset=_setpath)

def _getlinkpath(self):
    """Return the link name."""
    return self.linkname

def _setlinkpath(self, linkname):
    """Set the link name."""
    self.linkname = linkname

linkpath = property(fget=_getlinkpath, fset=_setlinkpath)
768
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000769 def __repr__(self):
770 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
771
def get_info(self):
    """Return the header fields of the TarInfo as a new dictionary.
       The mode is masked with 0o7777 and the name of a directory
       member always ends with a slash.
    """
    name = self.name
    if self.type == DIRTYPE and not name.endswith("/"):
        name += "/"

    return {
        "name":     name,
        "mode":     self.mode & 0o7777,
        "uid":      self.uid,
        "gid":      self.gid,
        "size":     self.size,
        "mtime":    self.mtime,
        "chksum":   self.chksum,
        "type":     self.type,
        "linkname": self.linkname,
        "uname":    self.uname,
        "gname":    self.gname,
        "devmajor": self.devmajor,
        "devminor": self.devminor,
    }
795
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
    """Return the header for this member as bytes, built by the
       create_*_header() method that belongs to format. Raise
       ValueError if format is not one of the *_FORMAT constants.
    """
    info = self.get_info()

    if format == USTAR_FORMAT:
        return self.create_ustar_header(info, encoding, errors)
    if format == GNU_FORMAT:
        return self.create_gnu_header(info, encoding, errors)
    if format == PAX_FORMAT:
        # The pax variant takes no error handler.
        return self.create_pax_header(info, encoding)

    raise ValueError("invalid format")
809
def create_ustar_header(self, info, encoding, errors):
    """Return a ustar header block for the info dictionary. Raise
       ValueError if the encoded linkname exceeds LENGTH_LINK or the
       name cannot be split into a prefix and a name part.
    """
    info["magic"] = POSIX_MAGIC

    encoded_linkname = info["linkname"].encode(encoding, errors)
    if len(encoded_linkname) > LENGTH_LINK:
        raise ValueError("linkname is too long")

    encoded_name = info["name"].encode(encoding, errors)
    if len(encoded_name) > LENGTH_NAME:
        # Too long for the name field alone: move the leading
        # path components into the prefix field.
        prefix, name = self._posix_split_name(info["name"], encoding, errors)
        info["prefix"] = prefix
        info["name"] = name

    return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000822
def create_gnu_header(self, info, encoding, errors):
    """Return a GNU header block for the info dictionary, preceded
       by a longlink and/or longname sequence if the encoded linkname
       or name is too long for its header field.
    """
    info["magic"] = GNU_MAGIC

    # The long link sequence goes first, then the long name sequence.
    oversize_checks = (
        ("linkname", LENGTH_LINK, GNUTYPE_LONGLINK),
        ("name", LENGTH_NAME, GNUTYPE_LONGNAME),
    )

    chunks = []
    for key, limit, long_type in oversize_checks:
        if len(info[key].encode(encoding, errors)) > limit:
            chunks.append(
                self._create_gnu_long_header(info[key], long_type, encoding, errors))

    chunks.append(self._create_header(info, GNU_FORMAT, encoding, errors))
    return b"".join(chunks)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000836
def create_pax_header(self, info, encoding):
    """Return a ustar header block for the info dictionary. Fields
       that do not fit into the ustar header are moved into a pax
       extended header sequence that is put in front of it.
    """
    info["magic"] = POSIX_MAGIC
    pax_headers = self.pax_headers.copy()

    # String fields: (key in info, pax keyword, field length).
    string_fields = (
        ("name", "path", LENGTH_NAME),
        ("linkname", "linkpath", LENGTH_LINK),
        ("uname", "uname", 32),
        ("gname", "gname", 32),
    )
    for field, keyword, limit in string_fields:
        if keyword in pax_headers:
            # An existing pax header wins.
            continue

        text = info[field]
        try:
            text.encode("ascii", "strict")
        except UnicodeEncodeError:
            # Not representable in ASCII.
            pax_headers[keyword] = text
        else:
            if len(text) > limit:
                # Too long for the header field.
                pax_headers[keyword] = text

    # Number fields: (key in info, width of the header field).
    number_fields = (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12))
    for field, digits in number_fields:
        if field in pax_headers:
            # An existing pax header wins. Avoid overflow.
            info[field] = 0
            continue

        number = info[field]
        if not 0 <= number < 8 ** (digits - 1) or isinstance(number, float):
            # Out of range for the octal field, or a float.
            pax_headers[field] = str(number)
            info[field] = 0

    # Only emit an extended header if there is something to put in it.
    if pax_headers:
        extended = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
    else:
        extended = b""

    return extended + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +0000885
@classmethod
def create_pax_global_header(cls, pax_headers):
    """Return a pax global header sequence (type XGLTYPE) holding
       the keyword, value pairs of pax_headers.
    """
    header_type = XGLTYPE
    return cls._create_pax_generic_header(pax_headers, header_type, "utf-8")
Guido van Rossumd8faa362007-04-27 19:54:29 +0000891
def _posix_split_name(self, name, encoding, errors):
    """Split name at a slash into a (prefix, name) pair whose encoded
       parts fit into LENGTH_PREFIX and LENGTH_NAME bytes. The first
       split point that fits is used. Raise ValueError if there is
       none.
    """
    parts = name.split("/")
    for cut in range(1, len(parts)):
        prefix = "/".join(parts[:cut])
        rest = "/".join(parts[cut:])
        prefix_fits = len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX
        if prefix_fits and len(rest.encode(encoding, errors)) <= LENGTH_NAME:
            return prefix, rest

    raise ValueError("name is too long")
907
@staticmethod
def _create_header(info, format, encoding, errors):
    """Return a header block. info is a dictionary with file
       information, format must be one of the *_FORMAT constants.
       encoding and errors are used to convert the string fields.

       The fields are laid out in the order in which frombuf() reads
       them back: name (100), mode (8), uid (8), gid (8), size (12),
       mtime (12), checksum (8), type (1), linkname (100), magic,
       uname (32), gname (32), devmajor (8), devminor (8) and
       prefix (155).
    """
    parts = [
        stn(info.get("name", ""), 100, encoding, errors),
        itn(info.get("mode", 0) & 0o7777, 8, format),
        itn(info.get("uid", 0), 8, format),
        itn(info.get("gid", 0), 8, format),
        itn(info.get("size", 0), 12, format),
        itn(info.get("mtime", 0), 12, format),
        # Placeholder for the checksum field. It has to be exactly
        # 8 bytes wide (offsets 148-155) so that the type field ends
        # up at offset 156 where frombuf() expects it.
        b" " * 8,
        info.get("type", REGTYPE),
        stn(info.get("linkname", ""), 100, encoding, errors),
        info.get("magic", POSIX_MAGIC),
        stn(info.get("uname", ""), 32, encoding, errors),
        stn(info.get("gname", ""), 32, encoding, errors),
        itn(info.get("devmajor", 0), 8, format),
        itn(info.get("devminor", 0), 8, format),
        stn(info.get("prefix", ""), 155, encoding, errors)
    ]

    # Pad the joined fields with NUL bytes up to a full block.
    buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))

    # The checksum is calculated while the placeholder is in place.
    chksum = calc_chksums(buf[-BLOCKSIZE:])[0]

    # Overwrite the first 7 bytes of the checksum field (offsets
    # 148-154 of the last block) with six octal digits and a NUL;
    # the eighth byte keeps the placeholder's space.
    buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
    return buf
935
@staticmethod
def _create_payload(payload):
    """Return payload padded with NUL up to the next multiple of
       BLOCKSIZE. A payload that already ends on a block border is
       returned as is.
    """
    overhang = len(payload) % BLOCKSIZE
    if overhang > 0:
        payload += NUL * (BLOCKSIZE - overhang)
    return payload
945
@classmethod
def _create_gnu_long_header(cls, name, type, encoding, errors):
    """Return a header block of the given type (GNUTYPE_LONGNAME or
       GNUTYPE_LONGLINK) followed by the data blocks that hold the
       encoded, NUL-terminated name.
    """
    data = name.encode(encoding, errors) + NUL

    info = {
        "name": "././@LongLink",
        "type": type,
        "size": len(data),
        "magic": GNU_MAGIC,
    }

    # Header block first, then the padded name.
    header = cls._create_header(info, USTAR_FORMAT, encoding, errors)
    return header + cls._create_payload(data)
962
@classmethod
def _create_pax_generic_header(cls, pax_headers, type, encoding):
    """Return a pax header sequence of the given type: a header
       block followed by the padded records built from the keyword,
       value pairs in pax_headers. The values must be strings.
    """
    # If any value cannot be encoded as strict UTF-8 (e.g. because it
    # contains surrogates), all values are written with
    # hdrcharset=BINARY.
    binary = False
    for value in pax_headers.values():
        try:
            value.encode("utf-8", "strict")
        except UnicodeEncodeError:
            binary = True
            break

    chunks = []
    if binary:
        # The hdrcharset record comes before all other records.
        chunks.append(b"21 hdrcharset=BINARY\n")

    for keyword, value in pax_headers.items():
        raw_keyword = keyword.encode("utf-8")
        if binary:
            # Get back the original bytes; this only works if
            # encoding matches the one the string was decoded with.
            raw_value = value.encode(encoding, "surrogateescape")
        else:
            raw_value = value.encode("utf-8")

        # The length prefix counts itself, so iterate until the
        # total no longer changes.
        base = len(raw_keyword) + len(raw_value) + 3   # ' ' + '=' + '\n'
        total = 0
        while True:
            candidate = base + len(str(total))
            if candidate == total:
                break
            total = candidate

        chunks.append(bytes(str(total), "ascii") + b" " + raw_keyword +
                      b"=" + raw_value + b"\n")

    records = b"".join(chunks)

    # The name is hardcoded to "././@PaxHeader" like star does,
    # instead of the one that POSIX recommends.
    info = {
        "name": "././@PaxHeader",
        "type": type,
        "size": len(records),
        "magic": POSIX_MAGIC,
    }

    # Header block first, then the padded records.
    header = cls._create_header(info, USTAR_FORMAT, "ascii", "replace")
    return header + cls._create_payload(records)
1013
@classmethod
def frombuf(cls, buf, encoding, errors):
    """Build a TarInfo object from buf, a bytes object that holds
       one header block. Raise EmptyHeaderError, TruncatedHeaderError,
       EOFHeaderError or InvalidHeaderError if buf is empty, has the
       wrong length, consists of NUL bytes only or fails the checksum
       test.
    """
    if len(buf) == 0:
        raise EmptyHeaderError("empty header")
    if len(buf) != BLOCKSIZE:
        raise TruncatedHeaderError("truncated header")
    if buf.count(NUL) == BLOCKSIZE:
        raise EOFHeaderError("end of file header")

    chksum = nti(buf[148:156])
    if chksum not in calc_chksums(buf):
        raise InvalidHeaderError("bad checksum")

    def text(start, stop):
        # Decode the string field at buf[start:stop].
        return nts(buf[start:stop], encoding, errors)

    def number(start, stop):
        # Decode the number field at buf[start:stop].
        return nti(buf[start:stop])

    obj = cls()
    obj.name = text(0, 100)
    obj.mode = number(100, 108)
    obj.uid = number(108, 116)
    obj.gid = number(116, 124)
    obj.size = number(124, 136)
    obj.mtime = number(136, 148)
    obj.chksum = chksum
    obj.type = buf[156:157]
    obj.linkname = text(157, 257)
    obj.uname = text(265, 297)
    obj.gname = text(297, 329)
    obj.devmajor = number(329, 337)
    obj.devminor = number(337, 345)
    prefix = text(345, 500)

    # The old V7 tar format stores a directory as a regular file
    # whose name ends with a slash.
    if obj.type == AREGTYPE and obj.name.endswith("/"):
        obj.type = DIRTYPE

    # The old GNU sparse format puts up to 4 sparse structures of
    # 24 bytes each into otherwise unused header space. Keep them
    # for later processing in _proc_sparse().
    if obj.type == GNUTYPE_SPARSE:
        structs = []
        for pos in range(386, 386 + 4 * 24, 24):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            structs.append((offset, numbytes))
        isextended = bool(buf[482])
        origsize = nti(buf[483:495])
        obj._sparse_structs = (structs, isextended, origsize)

    # Directory names are stored without trailing slashes.
    if obj.isdir():
        obj.name = obj.name.rstrip("/")

    # Put a ustar long name back together.
    if prefix and obj.type not in GNU_TYPES:
        obj.name = prefix + "/" + obj.name
    return obj
1076
@classmethod
def fromtarfile(cls, tarfile):
    """Read one header block from the TarFile object tarfile and
       return the TarInfo object that _proc_member() makes of it.
    """
    stream = tarfile.fileobj
    block = stream.read(BLOCKSIZE)
    member = cls.frombuf(block, tarfile.encoding, tarfile.errors)
    # The header started one block before the current position.
    member.offset = stream.tell() - BLOCKSIZE
    return member._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001086
Guido van Rossumd8faa362007-04-27 19:54:29 +00001087 #--------------------------------------------------------------------------
1088 # The following are methods that are called depending on the type of a
1089 # member. The entry point is _proc_member() which can be overridden in a
1090 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1091 # implement the following
1092 # operations:
1093 # 1. Set self.offset_data to the position where the data blocks begin,
1094 # if there is data that follows.
1095 # 2. Set tarfile.offset to the position where the next member's header will
1096 # begin.
1097 # 3. Return self or another valid TarInfo object.
def _proc_member(self, tarfile):
    """Pick the _proc_*() method that belongs to the member's type,
       call it with tarfile and return its result.
    """
    kind = self.type
    if kind in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
        handler = self._proc_gnulong
    elif kind == GNUTYPE_SPARSE:
        handler = self._proc_sparse
    elif kind in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
        handler = self._proc_pax
    else:
        handler = self._proc_builtin
    return handler(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001110
def _proc_builtin(self, tarfile):
    """Process a member of a builtin type. A member of an unknown
       type is handled like a regular file.
    """
    data_start = tarfile.fileobj.tell()
    self.offset_data = data_start

    if self.isreg() or self.type not in SUPPORTED_TYPES:
        # The next header follows the data blocks.
        tarfile.offset = data_start + self._block(self.size)
    else:
        tarfile.offset = data_start

    # Apply the information from a saved global header.
    self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

    return self
1127
def _proc_gnulong(self, tarfile):
    """Process the blocks that hold a GNU longname
       or longlink member.

       Return the TarInfo object built from the header that follows
       the long name data, with its name or linkname replaced by the
       long one. Raise SubsequentHeaderError if that header is
       missing or bad.
    """
    buf = tarfile.fileobj.read(self._block(self.size))

    # Fetch the next header and process it.
    try:
        member = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Patch the TarInfo object from the next header with
    # the longname information.
    member.offset = self.offset
    if self.type == GNUTYPE_LONGNAME:
        member.name = nts(buf, tarfile.encoding, tarfile.errors)

        # frombuf() removed the trailing slashes from the short name
        # of a directory, but the long name that replaces it still
        # has them. Remove them here as well, so that directory names
        # look the same no matter how long they are.
        if member.isdir():
            member.name = member.name.rstrip("/")
    elif self.type == GNUTYPE_LONGLINK:
        member.linkname = nts(buf, tarfile.encoding, tarfile.errors)

    return member
1149
def _proc_sparse(self, tarfile):
    """Process an old GNU sparse header together with the extension
       blocks that may follow it.
    """
    # frombuf() stored the structures found in the header itself.
    structs, isextended, origsize = self._sparse_structs
    del self._sparse_structs

    stream = tarfile.fileobj

    # Each extension block has room for 21 structures of 24 bytes,
    # the byte at offset 504 tells if another block follows.
    while isextended:
        buf = stream.read(BLOCKSIZE)
        for pos in range(0, 21 * 24, 24):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            if offset and numbytes:
                structs.append((offset, numbytes))
        isextended = bool(buf[504])
    self.sparse = structs

    # self.size still is the stored size here, it is needed to find
    # the next header before it is replaced with the original size.
    self.offset_data = stream.tell()
    tarfile.offset = self.offset_data + self._block(self.size)
    self.size = origsize
    return self
1177
def _proc_pax(self, tarfile):
    """Process an extended or global header as described in
       POSIX.1-2008.

       Return the TarInfo object built from the header that follows
       the pax records. Raise InvalidHeaderError if a record has a
       length of zero and SubsequentHeaderError if the following
       header is missing or bad.
    """
    # Read the header information.
    buf = tarfile.fileobj.read(self._block(self.size))

    # A pax header stores supplemental information for either
    # the following file (extended) or all following files
    # (global).
    if self.type == XGLTYPE:
        pax_headers = tarfile.pax_headers
    else:
        pax_headers = tarfile.pax_headers.copy()

    # Check if the pax header contains a hdrcharset field. This tells us
    # the encoding of the path, linkpath, uname and gname fields. Normally,
    # these fields are UTF-8 encoded but since POSIX.1-2008 tar
    # implementations are allowed to store them as raw binary strings if
    # the translation to UTF-8 fails.
    match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
    if match is not None:
        pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

    # For the time being, we don't care about anything other than "BINARY".
    # The only other value that is currently allowed by the standard is
    # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
    hdrcharset = pax_headers.get("hdrcharset")
    if hdrcharset == "BINARY":
        encoding = tarfile.encoding
    else:
        encoding = "utf-8"

    # Parse pax header information. A record looks like that:
    # "%d %s=%s\n" % (length, keyword, value). length is the size
    # of the complete record including the length field itself and
    # the newline. keyword and value are both UTF-8 encoded strings.
    regex = re.compile(br"(\d+) ([^=]+)=")
    pos = 0
    while True:
        match = regex.match(buf, pos)
        if not match:
            break

        length, keyword = match.groups()
        length = int(length)
        if length == 0:
            # A record of length zero would leave pos where it is and
            # the same record would be matched again forever.
            raise InvalidHeaderError("invalid header")
        value = buf[match.end(2) + 1:match.start(1) + length - 1]

        # Normally, we could just use "utf-8" as the encoding and "strict"
        # as the error handler, but we better not take the risk. For
        # example, GNU tar <= 1.23 is known to store filenames it cannot
        # translate to UTF-8 as raw strings (unfortunately without a
        # hdrcharset=BINARY header).
        # We first try the strict standard encoding, and if that fails we
        # fall back on the user's encoding and error handler.
        keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                tarfile.errors)
        if keyword in PAX_NAME_FIELDS:
            value = self._decode_pax_field(value, encoding, tarfile.encoding,
                    tarfile.errors)
        else:
            value = self._decode_pax_field(value, "utf-8", "utf-8",
                    tarfile.errors)

        pax_headers[keyword] = value
        pos += length

    # Fetch the next header.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Process GNU sparse information.
    if "GNU.sparse.map" in pax_headers:
        # GNU extended sparse format version 0.1.
        self._proc_gnusparse_01(next, pax_headers)

    elif "GNU.sparse.size" in pax_headers:
        # GNU extended sparse format version 0.0.
        self._proc_gnusparse_00(next, pax_headers, buf)

    elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
        # GNU extended sparse format version 1.0.
        self._proc_gnusparse_10(next, pax_headers, tarfile)

    if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
        # Patch the TarInfo object with the extended header info.
        next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
        next.offset = self.offset

        if "size" in pax_headers:
            # If the extended header replaces the size field,
            # we need to recalculate the offset where the next
            # header starts.
            offset = next.offset_data
            if next.isreg() or next.type not in SUPPORTED_TYPES:
                offset += next._block(next.size)
            tarfile.offset = offset

    return next
1279
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001280 def _proc_gnusparse_00(self, next, pax_headers, buf):
1281 """Process a GNU tar extended sparse header, version 0.0.
1282 """
1283 offsets = []
1284 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1285 offsets.append(int(match.group(1)))
1286 numbytes = []
1287 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1288 numbytes.append(int(match.group(1)))
1289 next.sparse = list(zip(offsets, numbytes))
1290
1291 def _proc_gnusparse_01(self, next, pax_headers):
1292 """Process a GNU tar extended sparse header, version 0.1.
1293 """
1294 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1295 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1296
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.
       The map is read from the archive as newline-terminated
       numbers: a count, followed by two numbers for each entry.
    """
    stream = tarfile.fileobj

    # The first number tells how many pairs follow.
    head, buf = stream.read(BLOCKSIZE).split(b"\n", 1)
    wanted = int(head) * 2

    sparse = []
    while len(sparse) < wanted:
        if b"\n" not in buf:
            # No complete number left, read another block.
            buf += stream.read(BLOCKSIZE)
        number, buf = buf.split(b"\n", 1)
        sparse.append(int(number))

    # The position after the blocks read here is taken as the
    # start of the member's data.
    next.offset_data = stream.tell()
    next.sparse = list(zip(sparse[::2], sparse[1::2]))
1312
def _apply_pax_info(self, pax_headers, encoding, errors):
    """Overwrite attributes with the values from pax_headers, the
       headers of a previous pax extended or global header, and keep
       a copy of pax_headers in self.pax_headers.
    """
    for keyword, value in pax_headers.items():
        if keyword == "GNU.sparse.name":
            self.path = value
        elif keyword in ("GNU.sparse.size", "GNU.sparse.realsize"):
            self.size = int(value)
        elif keyword in PAX_FIELDS:
            if keyword in PAX_NUMBER_FIELDS:
                # A number that cannot be converted becomes 0.
                try:
                    value = PAX_NUMBER_FIELDS[keyword](value)
                except ValueError:
                    value = 0
            if keyword == "path":
                value = value.rstrip("/")
            setattr(self, keyword, value)

    self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001335
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001336 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1337 """Decode a single field from a pax record.
1338 """
1339 try:
1340 return value.decode(encoding, "strict")
1341 except UnicodeDecodeError:
1342 return value.decode(fallback_encoding, fallback_errors)
1343
def _block(self, count):
    """Return count rounded up to the next multiple of BLOCKSIZE,
       e.g. _block(834) => 1024.
    """
    full, rest = divmod(count, BLOCKSIZE)
    if rest:
        # A partly filled block counts as a whole one.
        return (full + 1) * BLOCKSIZE
    return full * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001352
def isreg(self):
    """Return True if the type is one of REGULAR_TYPES."""
    return self.type in REGULAR_TYPES

def isfile(self):
    """Return the result of isreg()."""
    return self.isreg()

def isdir(self):
    """Return True if the type is DIRTYPE."""
    return self.type == DIRTYPE

def issym(self):
    """Return True if the type is SYMTYPE."""
    return self.type == SYMTYPE

def islnk(self):
    """Return True if the type is LNKTYPE."""
    return self.type == LNKTYPE

def ischr(self):
    """Return True if the type is CHRTYPE."""
    return self.type == CHRTYPE

def isblk(self):
    """Return True if the type is BLKTYPE."""
    return self.type == BLKTYPE

def isfifo(self):
    """Return True if the type is FIFOTYPE."""
    return self.type == FIFOTYPE

def issparse(self):
    """Return True if sparse information has been set."""
    return self.sparse is not None

def isdev(self):
    """Return True if the type is CHRTYPE, BLKTYPE or FIFOTYPE."""
    return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1373# class TarInfo
1374
1375class TarFile(object):
1376 """The TarFile Class provides an interface to tar archives.
1377 """
1378
# Debug verbosity; may be set from 0 (no msgs) to 3 (all msgs).
debug = 0

# If true, add the content of a linked file to the tar file,
# else add the link itself.
dereference = False

# If true, skip empty or invalid blocks and continue processing.
ignore_zeros = False

# If 0, fatal errors only appear in debug messages (if debug >= 0).
# If > 0, errors are passed to the caller as exceptions.
errorlevel = 1

# The format to use when creating an archive.
format = DEFAULT_FORMAT

# Encoding for 8-bit character strings.
encoding = ENCODING

# Error handler for unicode conversion.
errors = None

# The default TarInfo class to use.
tarinfo = TarInfo

# The file-object for extractfile().
fileobject = ExFileObject
Guido van Rossumd8faa362007-04-27 19:54:29 +00001400
def __init__(self, name=None, mode="r", fileobj=None, format=None,
        tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
        errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
    """Open an (uncompressed) tar archive `name'. `mode' is 'r' to read
       from an existing archive, 'a' to append data to an existing file,
       'w' to create a new file overwriting an existing one, or 'x' to
       open the file with the exclusive-creation mode "xb". `mode'
       defaults to 'r'.
       If `fileobj' is given, it is used for reading or writing data and
       its `mode' attribute, when present, replaces the internal file
       mode. `fileobj' is not closed when the TarFile is closed.
       Every other argument that is not None overrides the class-level
       attribute of the same name.
    """
    file_modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
    if mode not in file_modes:
        raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
    self.mode = mode
    self._mode = file_modes[mode]

    if fileobj:
        # Caller-supplied file object: borrow its name and mode if it
        # exposes them, and remember that we do not own it.
        if (name is None and hasattr(fileobj, "name") and
                isinstance(fileobj.name, (str, bytes))):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
    else:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        self._extfileobj = False
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Per-instance overrides of the class-level defaults.
    overrides = (
        ("format", format),
        ("tarinfo", tarinfo),
        ("dereference", dereference),
        ("ignore_zeros", ignore_zeros),
        ("encoding", encoding),
    )
    for attr, given in overrides:
        if given is not None:
            setattr(self, attr, given)
    self.errors = errors

    # Global pax headers are only kept when the archive format is pax.
    if pax_headers is not None and self.format == PAX_FORMAT:
        self.pax_headers = pax_headers
    else:
        self.pax_headers = {}

    for attr, given in (("debug", debug), ("errorlevel", errorlevel)):
        if given is not None:
            setattr(self, attr, given)

    # Init datastructures.
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    member = self.tarinfo.fromtarfile(self)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as exc:
                    raise ReadError(str(exc))
                self.members.append(member)

        if self.mode in ("a", "w", "x"):
            self._loaded = True

            if self.pax_headers:
                header = self.tarinfo.create_pax_global_header(
                    self.pax_headers.copy())
                self.fileobj.write(header)
                self.offset += len(header)
    except BaseException:
        # Whatever went wrong, do not leak a file we opened ourselves.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001498
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001499 #--------------------------------------------------------------------------
1500 # Below are the classmethods which act as alternate constructors to the
1501 # TarFile class. The open() method is the only one that is needed for
1502 # public use; it is the "super"-constructor and is able to select an
1503 # adequate "sub"-constructor for a particular compression using the mapping
1504 # from OPEN_METH.
1505 #
1506 # This concept allows one to subclass TarFile without losing the comfort of
1507 # the super-constructor. A sub-constructor is registered and made available
1508 # by adding it to the mapping in OPEN_METH.
1509
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending and return
       the object produced by the selected opener.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'r:xz'       open for reading with lzma compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression
       'w:xz'       open for writing with lzma compression

       'x' or 'x:'  create a tarfile exclusively without compression, raise
                    an exception if the file is already created
       'x:gz'       create a gzip compressed tarfile, raise an exception
                    if the file is already created
       'x:bz2'      create a bzip2 compressed tarfile, raise an exception
                    if the file is already created
       'x:xz'       create an lzma compressed tarfile, raise an exception
                    if the file is already created

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'r|xz'       open an lzma compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing
       'w|xz'       open an lzma compressed stream for writing

       Raises ValueError if neither `name' nor `fileobj' is given or if
       the mode cannot be interpreted, CompressionError for an unknown
       compression type and ReadError if no opener accepts the file.
    """
    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Try every registered opener in turn; the key sorts the
        # 'taropen' entry after all the others.
        candidates = sorted(cls.OPEN_METH,
                            key=lambda kind: cls.OPEN_METH[kind] == 'taropen')
        for kind in candidates:
            opener = getattr(cls, cls.OPEN_METH[kind])
            if fileobj is not None:
                start = fileobj.tell()
            try:
                return opener(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Rewind so the next opener sees the same bytes.
                if fileobj is not None:
                    fileobj.seek(start)
        raise ReadError("file could not be opened successfully")

    if ":" in mode:
        filemode, comptype = mode.split(":", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype not in cls.OPEN_METH:
            raise CompressionError("unknown compression type %r" % comptype)
        opener = getattr(cls, cls.OPEN_METH[comptype])
        return opener(name, filemode, fileobj, **kwargs)

    if "|" in mode:
        filemode, comptype = mode.split("|", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            archive = cls(name, filemode, stream, **kwargs)
        except:
            stream.close()
            raise
        # The stream wrapper was created here, so the TarFile owns it.
        archive._extfileobj = False
        return archive

    if mode in ("a", "w", "x"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001600
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open the uncompressed tar archive `name' and return a `cls'
       instance. `mode' must be 'r', 'a', 'w' or 'x', otherwise
       ValueError is raised.
    """
    allowed = ("r", "a", "w", "x")
    if mode in allowed:
        return cls(name, mode, fileobj, **kwargs)
    raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001608
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open the gzip compressed tar archive `name' for reading or
       writing. Appending is not allowed.

       Raises ValueError for a mode other than 'r', 'w' or 'x',
       CompressionError if gzip.GzipFile cannot be imported and
       ReadError if an OSError occurs while opening for reading.
    """
    if mode not in ("r", "w", "x"):
        raise ValueError("mode must be 'r', 'w' or 'x'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    try:
        gzfile = gzip.GzipFile(filename=name, mode=mode + "b",
                               compresslevel=compresslevel, fileobj=fileobj)
    except OSError:
        if fileobj is not None and mode == 'r':
            raise ReadError("not a gzip file")
        raise

    try:
        archive = cls.taropen(name, mode, gzfile, **kwargs)
    except BaseException as exc:
        # Always release the gzip wrapper; an OSError while reading is
        # reported as ReadError, anything else propagates unchanged.
        gzfile.close()
        if isinstance(exc, OSError) and mode == 'r':
            raise ReadError("not a gzip file")
        raise
    archive._extfileobj = False
    return archive
1642
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open the bzip2 compressed tar archive `name' for reading or
       writing. Appending is not allowed.

       Raises ValueError for a mode other than 'r', 'w' or 'x',
       CompressionError if the bz2 module cannot be imported and
       ReadError if OSError or EOFError occurs while opening for reading.
    """
    if mode not in ("r", "w", "x"):
        raise ValueError("mode must be 'r', 'w' or 'x'")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    target = fileobj or name
    bzfile = bz2.BZ2File(target, mode, compresslevel=compresslevel)

    try:
        archive = cls.taropen(name, mode, bzfile, **kwargs)
    except BaseException as exc:
        # Always release the bz2 wrapper; OSError/EOFError while reading
        # is reported as ReadError, anything else propagates unchanged.
        bzfile.close()
        if isinstance(exc, (OSError, EOFError)) and mode == 'r':
            raise ReadError("not a bzip2 file")
        raise
    archive._extfileobj = False
    return archive
1671
@classmethod
def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
    """Open the lzma compressed tar archive `name' for reading or
       writing. Appending is not allowed.

       Raises ValueError for a mode other than 'r', 'w' or 'x',
       CompressionError if the lzma module cannot be imported and
       ReadError if lzma.LZMAError or EOFError occurs while opening for
       reading.
    """
    if mode not in ("r", "w", "x"):
        raise ValueError("mode must be 'r', 'w' or 'x'")

    try:
        import lzma
    except ImportError:
        raise CompressionError("lzma module is not available")

    target = fileobj or name
    xzfile = lzma.LZMAFile(target, mode, preset=preset)

    try:
        archive = cls.taropen(name, mode, xzfile, **kwargs)
    except BaseException as exc:
        # Always release the lzma wrapper; LZMAError/EOFError while
        # reading is reported as ReadError, anything else propagates.
        xzfile.close()
        if isinstance(exc, (lzma.LZMAError, EOFError)) and mode == 'r':
            raise ReadError("not an lzma file")
        raise
    archive._extfileobj = False
    return archive
1699
# Registry of all *open() methods: compression type -> method name.
OPEN_METH = dict(
    tar="taropen",   # uncompressed tar
    gz="gzopen",     # gzip compressed tar
    bz2="bz2open",   # bzip2 compressed tar
    xz="xzopen",     # lzma compressed tar
)
1707
1708 #--------------------------------------------------------------------------
1709 # The public methods which TarFile provides:
1710
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive and the output is padded with zeros up to
       a multiple of RECORDSIZE. Closing an already closed TarFile does
       nothing.
    """
    if self.closed:
        return

    self.closed = True
    try:
        if self.mode in ("a", "w", "x"):
            trailer = BLOCKSIZE * 2
            self.fileobj.write(NUL * trailer)
            self.offset += trailer
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            leftover = self.offset % RECORDSIZE
            if leftover > 0:
                self.fileobj.write(NUL * (RECORDSIZE - leftover))
    finally:
        # Only close file objects that were not handed in by the caller.
        if not self._extfileobj:
            self.fileobj.close()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001731
def getmember(self, name):
    """Return a TarInfo object for member `name'. KeyError is raised if
       `name' can not be found in the archive. If a member occurs more
       than once in the archive, its last occurrence is assumed to be
       the most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001742
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects.
       The list has the same order as the members in the archive.
    """
    self._check()
    # A complete listing needs the whole archive to have been scanned.
    if not self._loaded:
        self._load()
    return self.members
1752
def getnames(self):
    """Return the names of the archive members as a list, in the same
       order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001758
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object from the result of os.stat or equivalent
       on an existing file. The file is either named by `name', or
       specified as a file object `fileobj' with a file descriptor. If
       given, `arcname' specifies an alternative name for the file in the
       archive, otherwise, the name is taken from the 'name' attribute of
       'fileobj', or the 'name' argument. The name should be a text
       string. None is returned for a file type that is not one of
       regular file, directory, fifo, symbolic link, character device or
       block device.
    """
    self._check("awx")

    # When fileobj is given, replace name by
    # fileobj's real name.
    if fileobj is not None:
        name = fileobj.name

    # Building the name of the member in the archive.
    # The drive is dropped, backward slashes are converted to forward
    # slashes and leading slashes are stripped so the path is relative.
    if arcname is None:
        arcname = name
    arcname = os.path.splitdrive(arcname)[1].replace(os.sep, "/").lstrip("/")

    # Now, fill the TarInfo object with
    # information specific for the file.
    tarinfo = self.tarinfo()
    tarinfo.tarfile = self  # Not needed

    # Use os.stat or os.lstat, depending on platform
    # and if symlinks shall be resolved.
    if fileobj is not None:
        st = os.fstat(fileobj.fileno())
    elif hasattr(os, "lstat") and not self.dereference:
        st = os.lstat(name)
    else:
        st = os.stat(name)
    linkname = ""

    st_mode = st.st_mode
    if stat.S_ISREG(st_mode):
        inode = (st.st_ino, st.st_dev)
        already_archived = (not self.dereference and st.st_nlink > 1 and
                            inode in self.inodes and
                            arcname != self.inodes[inode])
        if already_archived:
            # Is it a hardlink to an already
            # archived file?
            kind = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # The inode is added only if its valid.
            # For win32 it is always 0.
            kind = REGTYPE
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(st_mode):
        kind = DIRTYPE
    elif stat.S_ISFIFO(st_mode):
        kind = FIFOTYPE
    elif stat.S_ISLNK(st_mode):
        kind = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(st_mode):
        kind = CHRTYPE
    elif stat.S_ISBLK(st_mode):
        kind = BLKTYPE
    else:
        return None

    # Fill the TarInfo object with all
    # information we can get.
    tarinfo.name = arcname
    tarinfo.mode = st_mode
    tarinfo.uid = st.st_uid
    tarinfo.gid = st.st_gid
    # Only members stored as regular files carry data.
    tarinfo.size = st.st_size if kind == REGTYPE else 0
    tarinfo.mtime = st.st_mtime
    tarinfo.type = kind
    tarinfo.linkname = linkname
    if pwd:
        try:
            tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
        except KeyError:
            pass

    if (kind in (CHRTYPE, BLKTYPE) and
            hasattr(os, "major") and hasattr(os, "minor")):
        tarinfo.devmajor = os.major(st.st_rdev)
        tarinfo.devminor = os.minor(st.st_rdev)
    return tarinfo
1858
def list(self, verbose=True, *, members=None):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced. `members' is optional and must be a subset of the
       list returned by getmembers().
    """
    self._check()

    entries = self if members is None else members
    for tarinfo in entries:
        if verbose:
            _safe_print(stat.filemode(tarinfo.mode))
            owner = tarinfo.uname or tarinfo.uid
            group = tarinfo.gname or tarinfo.gid
            _safe_print("%s/%s" % (owner, group))
            if tarinfo.ischr() or tarinfo.isblk():
                # Devices show "major,minor" in place of a size.
                device = "%d,%d" % (tarinfo.devmajor, tarinfo.devminor)
                _safe_print("%10s" % device)
            else:
                _safe_print("%10d" % tarinfo.size)
            stamp = time.localtime(tarinfo.mtime)[:6]
            _safe_print("%d-%02d-%02d %02d:%02d:%02d" % stamp)

        _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else ""))

        if verbose:
            if tarinfo.issym():
                _safe_print("-> " + tarinfo.linkname)
            if tarinfo.islnk():
                _safe_print("link to " + tarinfo.linkname)
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001890
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default, visiting their
       entries in sorted order so the archive layout does not depend on
       the order in which the file system lists them. This can be
       avoided by setting `recursive' to False. `exclude' is a
       deprecated function that should return True for each filename to
       be excluded. `filter' is a function that expects a TarInfo object
       argument and returns the changed TarInfo object, if it returns
       None the TarInfo object will be excluded from the archive.
    """
    self._check("awx")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)

    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        with bltn_open(name, "rb") as f:
            self.addfile(tarinfo, f)

    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            # os.listdir() returns entries in arbitrary order; sorting
            # makes the member order reproducible between runs.
            for f in sorted(os.listdir(name)):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                        recursive, exclude, filter=filter)

    else:
        self.addfile(tarinfo)
1951
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, it should be a binary file, and tarinfo.size bytes are read
       from it and added to the archive. You can create TarInfo objects
       directly, or by using gettarinfo().
    """
    self._check("awx")

    # Work on a shallow copy; it is the copy that ends up in self.members.
    member = copy.copy(tarinfo)

    header = member.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset += len(header)

    # If there's data to follow, append it.
    if fileobj is not None:
        copyfileobj(fileobj, self.fileobj, member.size)
        full_blocks, tail = divmod(member.size, BLOCKSIZE)
        if tail > 0:
            # Zero-fill the partially used last block.
            self.fileobj.write(NUL * (BLOCKSIZE - tail))
            full_blocks += 1
        self.offset += full_blocks * BLOCKSIZE

    self.members.append(member)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001976
def extractall(self, path=".", members=None, *, numeric_owner=False):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers(). If `numeric_owner` is True, only
       the numbers for user/group names are used and not the names.
    """
    pending_dirs = []
    entries = self if members is None else members

    for tarinfo in entries:
        if tarinfo.isdir():
            # Remember the real entry, but extract a copy with a safe
            # mode; the real attributes are applied further down.
            pending_dirs.append(tarinfo)
            tarinfo = copy.copy(tarinfo)
            tarinfo.mode = 0o700
        # Do not set_attrs directories, as we will do that further down
        self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(),
                     numeric_owner=numeric_owner)

    # Set correct owner, mtime and filemode on directories, walking
    # them in reverse sorted order of their names.
    for tarinfo in reversed(sorted(pending_dirs, key=lambda d: d.name)):
        dirpath = os.path.join(path, tarinfo.name)
        try:
            self.chown(tarinfo, dirpath, numeric_owner=numeric_owner)
            self.utime(tarinfo, dirpath)
            self.chmod(tarinfo, dirpath)
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % e)
2016
def extract(self, member, path="", set_attrs=True, *, numeric_owner=False):
    """Extract a single member below `path` (the current working
       directory by default) under its full name.

       `member` may be a name, which is resolved with getmember(), or
       a TarInfo object. `set_attrs` and `numeric_owner` are passed on
       to _extract_member(). An OSError is re-raised when errorlevel is
       greater than 0, an ExtractError when it is greater than 1;
       otherwise the error is only reported through _dbg().
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    if tarinfo.islnk():
        # makelink() reads this attribute to locate the hard link target.
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    destination = os.path.join(path, tarinfo.name)
    try:
        self._extract_member(tarinfo, destination,
                             set_attrs=set_attrs,
                             numeric_owner=numeric_owner)
    except OSError as exc:
        if self.errorlevel > 0:
            raise
        if exc.filename is None:
            report = "tarfile: %s" % exc.strerror
        else:
            report = "tarfile: %s %r" % (exc.strerror, exc.filename)
        self._dbg(1, report)
    except ExtractError as exc:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % exc)
2054
def extractfile(self, member):
    """Return a file object for a member of the archive.

       `member` may be a name or a TarInfo object. For a regular file,
       or a member whose type is not in SUPPORTED_TYPES, the object
       built by self.fileobject is returned. A hard or symbolic link
       yields the file object of its target, or raises StreamError
       when the archive is read through a _Stream. Any other member
       yields None.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        # Members with unknown types are treated as regular files.
        return self.fileobject(self, tarinfo)

    if not (tarinfo.islnk() or tarinfo.issym()):
        # No data is associated with the member (directory, chrdev,
        # blkdev, etc.).
        return None

    if isinstance(self.fileobj, _Stream):
        # A link cannot be followed as a file object when the tar blocks
        # come from a non-seekable stream.
        raise StreamError("cannot extract (sym)link as file object")

    # A (sym)link's file object is its target's file object.
    return self.extractfile(self._find_link_target(tarinfo))
2085
def _extract_member(self, tarinfo, targetpath, set_attrs=True,
                    numeric_owner=False):
    """Extract the TarInfo object tarinfo to a physical file called
       targetpath, then set owner, mode and mtime unless `set_attrs`
       is false.
    """
    # Drop trailing slashes and switch to the platform's path separator.
    targetpath = targetpath.rstrip("/").replace("/", os.sep)

    # Parent directories that are missing are created with default
    # permissions.
    parent = os.path.dirname(targetpath)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    if tarinfo.islnk() or tarinfo.issym():
        trace = "%s -> %s" % (tarinfo.name, tarinfo.linkname)
    else:
        trace = tarinfo.name
    self._dbg(1, trace)

    # Dispatch on the member type to the matching make*() method.
    if tarinfo.isreg():
        self.makefile(tarinfo, targetpath)
    elif tarinfo.isdir():
        self.makedir(tarinfo, targetpath)
    elif tarinfo.isfifo():
        self.makefifo(tarinfo, targetpath)
    elif tarinfo.ischr() or tarinfo.isblk():
        self.makedev(tarinfo, targetpath)
    elif tarinfo.islnk() or tarinfo.issym():
        self.makelink(tarinfo, targetpath)
    elif tarinfo.type not in SUPPORTED_TYPES:
        self.makeunknown(tarinfo, targetpath)
    else:
        self.makefile(tarinfo, targetpath)

    if not set_attrs:
        return

    self.chown(tarinfo, targetpath, numeric_owner)
    # Mode and mtime are not applied to symbolic links.
    if not tarinfo.issym():
        self.chmod(tarinfo, targetpath)
        self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002129
2130 #--------------------------------------------------------------------------
2131 # Below are the different file methods. They are called via
2132 # _extract_member() when extract() is called. They can be replaced in a
2133 # subclass to implement other functionality.
2134
def makedir(self, tarinfo, targetpath):
    """Make a directory called targetpath.

       The directory is created with the restrictive mode 0o700; the
       real mode is set later in _extract_member(). An already existing
       directory is accepted silently. If targetpath exists but is not
       a directory, the FileExistsError is re-raised instead of being
       swallowed, so the caller does not keep extracting into a
       non-directory.
    """
    try:
        os.mkdir(targetpath, 0o700)
    except FileExistsError:
        # Only an existing *directory* is an acceptable outcome.
        if not os.path.isdir(targetpath):
            raise
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002144
def makefile(self, tarinfo, targetpath):
    """Make a file called targetpath from the member's data in the
       archive.

       When tarinfo.sparse is set, each (offset, size) pair is copied
       to its offset in the target, which is then truncated at
       tarinfo.size.
    """
    archive = self.fileobj
    archive.seek(tarinfo.offset_data)
    with bltn_open(targetpath, "wb") as out:
        extents = tarinfo.sparse
        if extents is None:
            copyfileobj(archive, out, tarinfo.size, ReadError)
            return
        for position, length in extents:
            out.seek(position)
            copyfileobj(archive, out, length, ReadError)
        # Give the file its full size even if it ends in a hole.
        out.seek(tarinfo.size)
        out.truncate()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002159
def makeunknown(self, tarinfo, targetpath):
    """Extract a member of unknown type to targetpath as if it were
       a regular file and report that through _dbg().
    """
    self.makefile(tarinfo, targetpath)
    notice = ("tarfile: Unknown file type %r, "
              "extracted as regular file." % tarinfo.type)
    self._dbg(1, notice)
2167
def makefifo(self, tarinfo, targetpath):
    """Make a fifo called targetpath. Raise ExtractError if the os
       module has no mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002175
def makedev(self, tarinfo, targetpath):
    """Make a character or block device called targetpath. Raise
       ExtractError if the os module lacks mknod() or makedev().
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the permission bits with the file type bit of the device.
    node_type = stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR
    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, tarinfo.mode | node_type, device)
2190
def makelink(self, tarinfo, targetpath):
    """Make a symbolic or hard link called targetpath.

       A hard link whose target (prepared by extract() in
       tarinfo._link_target) does not exist on disk is replaced by
       an extracted copy of the referenced member. The same copy is
       attempted when link creation raises symlink_exception; a member
       that cannot be found then results in ExtractError.
    """
    try:
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        elif os.path.exists(tarinfo._link_target):
            os.link(tarinfo._link_target, targetpath)
        else:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
    except symlink_exception:
        # Links could not be created: fall back to copying the member.
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002213
def chown(self, tarinfo, targetpath, numeric_owner):
    """Set owner of targetpath according to tarinfo.

       Nothing is done unless the effective user id is 0. If
       numeric_owner is True, tarinfo.uid/.gid are used directly.
       Otherwise tarinfo.uname/.gname are looked up first and the
       numeric ids serve as fallback when a name is unknown or the
       pwd/grp module is not available. Symbolic links are changed
       with os.lchown() where it exists. Raise ExtractError if the
       ownership cannot be changed.
    """
    if not (hasattr(os, "geteuid") and os.geteuid() == 0):
        # We have to be root to change ownership.
        return

    # Start from the numeric ids; names only override them when they
    # can actually be resolved.
    g = tarinfo.gid
    u = tarinfo.uid
    if not numeric_owner:
        # NOTE(review): pwd was tested for truthiness by the previous
        # code, so the module-level name can evidently be false; grp is
        # presumably imported the same way - verify against the file's
        # import block.
        if grp:
            try:
                g = grp.getgrnam(tarinfo.gname)[2]
            except KeyError:
                pass
        if pwd:
            try:
                u = pwd.getpwnam(tarinfo.uname)[2]
            except KeyError:
                pass

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, u, g)
        else:
            os.chown(targetpath, u, g)
    except OSError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002239
def chmod(self, tarinfo, targetpath):
    """Set file permissions of targetpath to tarinfo.mode. Raise
       ExtractError if os.chmod() fails; do nothing when the os module
       has no chmod().
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except OSError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002248
def utime(self, tarinfo, targetpath):
    """Set access and modification time of targetpath to
       tarinfo.mtime. Raise ExtractError if os.utime() fails; do
       nothing when the os module has no utime().
    """
    if hasattr(os, 'utime'):
        stamp = tarinfo.mtime
        try:
            os.utime(targetpath, (stamp, stamp))
        except OSError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002258
2259 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, or
       None if there is no more available.

       The archive must be open in mode "r" or "a". A member stored in
       self.firstmember is handed out first. Every other member read
       is appended to self.members; once nothing more can be read the
       archive is marked as _loaded.
    """
    self._check("ra")

    pending = self.firstmember
    if pending is not None:
        self.firstmember = None
        return pending

    # Advance the file pointer. Reading the byte just before the
    # expected offset detects archives that end prematurely.
    if self.offset != self.fileobj.tell():
        self.fileobj.seek(self.offset - 1)
        if not self.fileobj.read(1):
            raise ReadError("unexpected end of data")

    # Read the next block. With ignore_zeros set, EOF and invalid
    # headers are skipped one block at a time.
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as exc:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, exc))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as exc:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, exc))
                self.offset += BLOCKSIZE
                continue
            elif self.offset == 0:
                raise ReadError(str(exc))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as exc:
            if self.offset == 0:
                raise ReadError(str(exc))
        except SubsequentHeaderError as exc:
            raise ReadError(str(exc))
        break

    if tarinfo is None:
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002310
2311 #--------------------------------------------------------------------------
2312 # Little helper methods:
2313
def _getmember(self, name, tarinfo=None, normalize=False):
    """Find an archive member by name, searching from the last member
       to the first. If tarinfo is given, only the members before it
       are searched. With normalize set, both names are compared after
       os.path.normpath(). Return None if nothing matches.
    """
    # getmembers() ensures that all members have been loaded.
    candidates = self.getmembers()

    if tarinfo is not None:
        candidates = candidates[:candidates.index(tarinfo)]

    if normalize:
        name = os.path.normpath(name)

    for candidate in reversed(candidates):
        candidate_name = (os.path.normpath(candidate.name) if normalize
                          else candidate.name)
        if name == candidate_name:
            return candidate
    return None
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002336
def _load(self):
    """Call next() until it returns None, i.e. read through the entire
       archive, then mark the archive as _loaded.
    """
    while self.next() is not None:
        pass
    self._loaded = True
2346
def _check(self, mode=None):
    """Raise OSError if the TarFile is closed or, when `mode` is given,
       if the TarFile's own mode is not contained in it.
    """
    if self.closed:
        raise OSError("%s is closed" % self.__class__.__name__)
    if mode is None:
        return
    if self.mode not in mode:
        raise OSError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002355
def _find_link_target(self, tarinfo):
    """Return the member a symlink or hardlink member points to.
       Raise KeyError if no such member is in the archive.
    """
    if tarinfo.issym():
        # A symlink target is resolved against the directory of the
        # link and searched for in the entire archive.
        parts = (os.path.dirname(tarinfo.name), tarinfo.linkname)
        linkname = "/".join(part for part in parts if part)
        limit = None
    else:
        # A hard link is just a reference to an already archived file,
        # so only the members before the link are searched.
        linkname = tarinfo.linkname
        limit = tarinfo

    member = self._getmember(linkname, tarinfo=limit, normalize=True)
    if member is None:
        raise KeyError("linkname %r not found" % linkname)
    return member
2374
def __iter__(self):
    """Provide an iterator object: a plain iterator over self.members
       once the archive is fully loaded, a TarIter otherwise.
    """
    return iter(self.members) if self._loaded else TarIter(self)
2382
def _dbg(self, level, msg):
    """Write msg to sys.stderr unless level exceeds self.debug.
    """
    if level > self.debug:
        return
    print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002388
def __enter__(self):
    """Enter the runtime context: let _check() verify that the TarFile
       is not closed and return the TarFile itself.
    """
    self._check()
    return self
2392
def __exit__(self, type, value, traceback):
    """Leave the runtime context. Without an exception the archive is
       closed normally. With one, only the underlying file object is
       closed (unless it was supplied externally) and the TarFile is
       marked as closed.
    """
    if type is not None:
        # An exception occurred. We must not call close() because
        # it would try to write end-of-archive blocks and padding.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        return
    self.close()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002402# class TarFile
2403
class TarIter:
    """Iterator over the members of a TarFile.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Remember the TarFile and start at its first member.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return the iterator itself.
        """
        return self

    def __next__(self):
        """Return the next member, reading it with TarFile's next()
           method when it is not yet in TarFile.members. When all
           members have been read, set TarFile as _loaded.
        """
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely.
        archive = self.tarfile

        if self.index == 0 and archive.firstmember is not None:
            entry = archive.next()
        elif self.index < len(archive.members):
            entry = archive.members[self.index]
        elif archive._loaded:
            raise StopIteration
        else:
            entry = archive.next()
            if not entry:
                archive._loaded = True
                raise StopIteration

        self.index += 1
        return entry
2441
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002442#--------------------
2443# exported functions
2444#--------------------
def is_tarfile(name):
    """Return True if name can be opened as a tar archive by this
       module's open(), else return False (i.e. when opening or closing
       it raises TarError).
    """
    try:
        open(name).close()
    except TarError:
        return False
    return True
2455
# Module-level shortcut for TarFile.open. From here on the name `open`
# in this module refers to it rather than to the built-in; makefile()
# opens its target through `bltn_open` instead (presumably an alias of
# the built-in open - confirm at its definition).
open = TarFile.open
Serhiy Storchakad27b4552013-11-24 01:53:29 +02002457
2458
def main():
    """Command line interface of the tarfile module.

       Exactly one of the following actions is performed, depending on
       the parsed arguments: test (-t), list (-l), extract (-e) or
       create (-c) an archive. Without an action the help text is
       printed and the process exits with status 1.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='A simple command line interface for tarfile module.')
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Verbose output')
    actions = parser.add_mutually_exclusive_group()
    actions.add_argument('-l', '--list', metavar='<tarfile>',
                         help='Show listing of a tarfile')
    actions.add_argument('-e', '--extract', nargs='+',
                         metavar=('<tarfile>', '<output_dir>'),
                         help='Extract tarfile into target dir')
    actions.add_argument('-c', '--create', nargs='+',
                         metavar=('<name>', '<file>'),
                         help='Create tarfile from sources')
    actions.add_argument('-t', '--test', metavar='<tarfile>',
                         help='Test if a tarfile is valid')
    args = parser.parse_args()

    def reject(archive):
        # parser.exit() terminates the process with the given status.
        parser.exit(1, '{!r} is not a tar archive.\n'.format(archive))

    if args.test:
        src = args.test
        if not is_tarfile(src):
            reject(src)
        with open(src, 'r') as tar:
            tar.getmembers()
            print(tar.getmembers(), file=sys.stderr)
        if args.verbose:
            print('{!r} is a tar archive.'.format(src))

    elif args.list:
        src = args.list
        if not is_tarfile(src):
            reject(src)
        with TarFile.open(src, 'r:*') as tf:
            tf.list(verbose=args.verbose)

    elif args.extract:
        # One argument: extract into the current directory; two: the
        # second one names the output directory.
        given = len(args.extract)
        if given == 1:
            src, curdir = args.extract[0], os.curdir
        elif given == 2:
            src, curdir = args.extract
        else:
            parser.exit(1, parser.format_help())

        if not is_tarfile(src):
            reject(src)
        with TarFile.open(src, 'r:*') as tf:
            tf.extractall(path=curdir)
        if args.verbose:
            if curdir == '.':
                msg = '{!r} file is extracted.'.format(src)
            else:
                msg = ('{!r} file is extracted '
                       'into {!r} directory.').format(src, curdir)
            print(msg)

    elif args.create:
        tar_name, *tar_files = args.create
        # The compression is chosen from the archive's file extension.
        compressions = {
            # gz
            '.gz': 'gz',
            '.tgz': 'gz',
            # xz
            '.xz': 'xz',
            '.txz': 'xz',
            # bz2
            '.bz2': 'bz2',
            '.tbz': 'bz2',
            '.tbz2': 'bz2',
            '.tb2': 'bz2',
        }
        compression = compressions.get(os.path.splitext(tar_name)[1])
        tar_mode = 'w' if compression is None else 'w:' + compression

        with TarFile.open(tar_name, tar_mode) as tf:
            for file_name in tar_files:
                tf.add(file_name)

        if args.verbose:
            print('{!r} file created.'.format(tar_name))

    else:
        parser.exit(1, parser.format_help())
2548
# Run the command line interface when the module is executed as a script.
if __name__ == '__main__':
    main()