blob: bf2234f63762db6756ad822b5359aa0a2c1785b4 [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
Guido van Rossumd8faa362007-04-27 19:54:29 +000032version = "0.9.0"
Guido van Rossum98297ee2007-11-06 21:34:58 +000033__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
Senthil Kumaran7c9719c2011-07-28 22:32:49 +080034__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
35__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
Guido van Rossum98297ee2007-11-06 21:34:58 +000036__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000037
38#---------
39# Imports
40#---------
Serhiy Storchakacf4a2f22015-03-11 17:18:03 +020041from builtins import open as bltn_open
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000042import sys
43import os
Eli Bendersky74c503b2012-01-03 06:26:13 +020044import io
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000045import shutil
46import stat
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000047import time
48import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000049import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000050import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000051
52try:
53 import grp, pwd
Brett Cannoncd171c82013-07-04 17:43:24 -040054except ImportError:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000055 grp = pwd = None
56
# Exceptions treated as "symlinks cannot be created here".
# os.symlink on Windows prior to 6.0 raises NotImplementedError, and
# OSError (winerror=1314) is raised if the caller does not hold the
# SeCreateSymbolicLinkPrivilege privilege.
symlink_exception = (AttributeError, NotImplementedError, OSError)
65
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000066# from tarfile import *
67__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
68
69#---------------------------------------------------------
70# tar constants
71#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
# Both magic values fill the 8-byte "magic" + "version" area of a header.
# The GNU variant is "ustar" followed by TWO spaces and a NUL.
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT

#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int,
}
140
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000141#---------------------------------------------------------
Guido van Rossumd8faa362007-04-27 19:54:29 +0000142# initialization
143#---------------------------------------------------------
# Default encoding: UTF-8 when os.name is "nt" or "ce", otherwise
# whatever sys.getfilesystemencoding() reports.
ENCODING = "utf-8" if os.name in ("nt", "ce") else sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000148
149#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000150# Some useful functions
151#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000152
def stn(s, length, encoding, errors):
    """Encode the string s and fit it into a field of length bytes.

    Shorter values are padded on the right with null bytes; longer
    values are truncated to length bytes.
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, b"\0")
Thomas Wouters477c8d52006-05-27 19:21:47 +0000158
def nts(s, encoding, errors):
    """Decode a bytes field, stopping at the first null byte (if any).
    """
    head, _sep, _tail = s.partition(b"\0")
    return head.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000166
def nti(s):
    """Convert a number field to a python number.

    s is the raw bytes of a header number field.  Two encodings are
    understood (see itn()):

    * a leading 0o200 or 0o377 byte marks a big-endian base-256 value
      in the remaining bytes (0o377 meaning a negative number);
    * otherwise the field is a null-terminated string of octal digits.

    Raises InvalidHeaderError if an octal field cannot be parsed.
    """
    if s[0] in (0o200, 0o377):
        # Base-256: everything after the marker byte is the magnitude.
        n = int.from_bytes(s[1:], "big")
        if s[0] == 0o377:
            # Negative numbers are stored offset by 256 ** (len(s) - 1).
            n -= 256 ** (len(s) - 1)
    else:
        try:
            text = nts(s, "ascii", "strict")
            # Some archives pad number fields with spaces only; treat a
            # blank field as zero instead of rejecting the whole header.
            n = int(text.strip() or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n
185
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.

    n is the value (it is truncated to an int first), digits is the
    total width of the field in bytes and format selects which
    encodings are allowed.  Returns a bytes object for the octal
    encoding and a bytearray for the GNU base-256 encoding.

    Raises ValueError if n does not fit into the field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicates this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.

    # Coerce once up front: the base-256 branch uses bitwise operators,
    # which would raise TypeError for a float (e.g. a float mtime).
    n = int(n)
    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            n = 256 ** digits + n

        # Emit the low digits-1 bytes, most significant first.
        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s
213
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a member's header.

    All bytes of the header are summed up except for the chksum field,
    which is counted as if it was filled with spaces (8 * 0x20 = 256).
    According to the GNU tar sources, some tars (Sun and NeXT) calculate
    chksum with signed char, which gives a different result if there
    are bytes with the high bit set, so both variants are computed.
    """
    # "B" reads unsigned bytes, "b" signed ones; "8x" skips the chksum field.
    layouts = ("148B8x356B", "148b8x356b")
    unsigned_chksum, signed_chksum = (
        256 + sum(struct.unpack_from(layout, buf)) for layout in layouts
    )
    return unsigned_chksum, signed_chksum
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000226
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.

       Raises OSError if src runs dry before length bytes were read.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    BUFSIZE = 16 * 1024
    full_chunks, tail = divmod(length, BUFSIZE)

    # Plan every read up front: the full-sized chunks, then the rest.
    read_sizes = [BUFSIZE] * full_chunks
    if tail != 0:
        read_sizes.append(tail)

    for wanted in read_sizes:
        data = src.read(wanted)
        if len(data) < wanted:
            raise OSError("end of file reached")
        dst.write(data)
    return
251
def filemode(mode):
    """Deprecated in this location; use stat.filemode."""
    from warnings import warn
    # stacklevel=2 attributes the warning to the caller of filemode().
    warn("deprecated in favor of stat.filemode",
         DeprecationWarning, stacklevel=2)
    return stat.filemode(mode)
258
Serhiy Storchaka3b4f1592014-02-05 20:53:36 +0200259def _safe_print(s):
260 encoding = getattr(sys.stdout, 'encoding', None)
261 if encoding is not None:
262 s = s.encode(encoding, 'backslashreplace').decode(encoding)
263 print(s, end=' ')
264
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000265
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000299
300#---------------------------
301# internal stream interface
302#---------------------------
303class _LowLevelFile:
304 """Low-level file object. Supports reading and writing.
305 It is used instead of a regular file object for streaming
306 access.
307 """
308
309 def __init__(self, name, mode):
310 mode = {
311 "r": os.O_RDONLY,
312 "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
313 }[mode]
314 if hasattr(os, "O_BINARY"):
315 mode |= os.O_BINARY
Lars Gustäbeld6eb70b2010-04-29 15:37:02 +0000316 self.fd = os.open(name, mode, 0o666)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000317
318 def close(self):
319 os.close(self.fd)
320
321 def read(self, size):
322 return os.read(self.fd, size)
323
324 def write(self, s):
325 os.write(self.fd, s)
326
327class _Stream:
328 """Class that serves as an adapter between TarFile and
329 a stream-like object. The stream-like object only
330 needs to have a read() or write() method and is accessed
331 blockwise. Use of gzip or bzip2 compression is possible.
332 A stream-like object could be for example: sys.stdin,
333 sys.stdout, a socket, a tape device etc.
334
335 _Stream is intended to be used only internally.
336 """
337
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000338 def __init__(self, name, mode, comptype, fileobj, bufsize):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000339 """Construct a _Stream object.
340 """
341 self._extfileobj = True
342 if fileobj is None:
343 fileobj = _LowLevelFile(name, mode)
344 self._extfileobj = False
345
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000346 if comptype == '*':
347 # Enable transparent compression detection for the
348 # stream interface
349 fileobj = _StreamProxy(fileobj)
350 comptype = fileobj.getcomptype()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000351
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000352 self.name = name or ""
353 self.mode = mode
354 self.comptype = comptype
355 self.fileobj = fileobj
356 self.bufsize = bufsize
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000357 self.buf = b""
Guido van Rossume2a383d2007-01-15 16:59:06 +0000358 self.pos = 0
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000359 self.closed = False
360
Antoine Pitrou605c2932010-09-23 20:15:14 +0000361 try:
362 if comptype == "gz":
363 try:
364 import zlib
Brett Cannoncd171c82013-07-04 17:43:24 -0400365 except ImportError:
Antoine Pitrou605c2932010-09-23 20:15:14 +0000366 raise CompressionError("zlib module is not available")
367 self.zlib = zlib
368 self.crc = zlib.crc32(b"")
369 if mode == "r":
370 self._init_read_gz()
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100371 self.exception = zlib.error
Antoine Pitrou605c2932010-09-23 20:15:14 +0000372 else:
373 self._init_write_gz()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000374
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100375 elif comptype == "bz2":
Antoine Pitrou605c2932010-09-23 20:15:14 +0000376 try:
377 import bz2
Brett Cannoncd171c82013-07-04 17:43:24 -0400378 except ImportError:
Antoine Pitrou605c2932010-09-23 20:15:14 +0000379 raise CompressionError("bz2 module is not available")
380 if mode == "r":
381 self.dbuf = b""
382 self.cmp = bz2.BZ2Decompressor()
Andrew Svetlovf7a17b42012-12-25 16:47:37 +0200383 self.exception = OSError
Antoine Pitrou605c2932010-09-23 20:15:14 +0000384 else:
385 self.cmp = bz2.BZ2Compressor()
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100386
387 elif comptype == "xz":
388 try:
389 import lzma
Brett Cannoncd171c82013-07-04 17:43:24 -0400390 except ImportError:
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100391 raise CompressionError("lzma module is not available")
392 if mode == "r":
393 self.dbuf = b""
394 self.cmp = lzma.LZMADecompressor()
395 self.exception = lzma.LZMAError
396 else:
397 self.cmp = lzma.LZMACompressor()
398
399 elif comptype != "tar":
400 raise CompressionError("unknown compression type %r" % comptype)
401
Antoine Pitrou605c2932010-09-23 20:15:14 +0000402 except:
403 if not self._extfileobj:
404 self.fileobj.close()
405 self.closed = True
406 raise
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000407
408 def __del__(self):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000409 if hasattr(self, "closed") and not self.closed:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000410 self.close()
411
412 def _init_write_gz(self):
413 """Initialize for writing with gzip compression.
414 """
415 self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
416 -self.zlib.MAX_WBITS,
417 self.zlib.DEF_MEM_LEVEL,
418 0)
Guido van Rossume2a383d2007-01-15 16:59:06 +0000419 timestamp = struct.pack("<L", int(time.time()))
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000420 self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000421 if self.name.endswith(".gz"):
422 self.name = self.name[:-3]
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000423 # RFC1952 says we must use ISO-8859-1 for the FNAME field.
424 self.__write(self.name.encode("iso-8859-1", "replace") + NUL)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000425
426 def write(self, s):
427 """Write string s to the stream.
428 """
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000429 if self.comptype == "gz":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000430 self.crc = self.zlib.crc32(s, self.crc)
431 self.pos += len(s)
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000432 if self.comptype != "tar":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000433 s = self.cmp.compress(s)
434 self.__write(s)
435
436 def __write(self, s):
437 """Write string s to the stream if a whole new block
438 is ready to be written.
439 """
440 self.buf += s
441 while len(self.buf) > self.bufsize:
442 self.fileobj.write(self.buf[:self.bufsize])
443 self.buf = self.buf[self.bufsize:]
444
445 def close(self):
446 """Close the _Stream object. No operation should be
447 done on it afterwards.
448 """
449 if self.closed:
450 return
451
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000452 self.closed = True
Serhiy Storchaka7e7a3db2015-04-10 13:24:41 +0300453 try:
454 if self.mode == "w" and self.comptype != "tar":
455 self.buf += self.cmp.flush()
456
457 if self.mode == "w" and self.buf:
458 self.fileobj.write(self.buf)
459 self.buf = b""
460 if self.comptype == "gz":
461 # The native zlib crc is an unsigned 32-bit integer, but
462 # the Python wrapper implicitly casts that to a signed C
463 # long. So, on a 32-bit box self.crc may "look negative",
464 # while the same crc on a 64-bit box may "look positive".
465 # To avoid irksome warnings from the `struct` module, force
466 # it to look positive on all boxes.
467 self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
468 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
469 finally:
470 if not self._extfileobj:
471 self.fileobj.close()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000472
473 def _init_read_gz(self):
474 """Initialize for reading a gzip compressed fileobj.
475 """
476 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000477 self.dbuf = b""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000478
479 # taken from gzip.GzipFile with some alterations
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000480 if self.__read(2) != b"\037\213":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000481 raise ReadError("not a gzip file")
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000482 if self.__read(1) != b"\010":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000483 raise CompressionError("unsupported compression method")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000484
485 flag = ord(self.__read(1))
486 self.__read(6)
487
488 if flag & 4:
489 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
490 self.read(xlen)
491 if flag & 8:
492 while True:
493 s = self.__read(1)
494 if not s or s == NUL:
495 break
496 if flag & 16:
497 while True:
498 s = self.__read(1)
499 if not s or s == NUL:
500 break
501 if flag & 2:
502 self.__read(2)
503
504 def tell(self):
505 """Return the stream's file pointer position.
506 """
507 return self.pos
508
509 def seek(self, pos=0):
510 """Set the stream's file pointer to pos. Negative seeking
511 is forbidden.
512 """
513 if pos - self.pos >= 0:
514 blocks, remainder = divmod(pos - self.pos, self.bufsize)
Guido van Rossum805365e2007-05-07 22:24:25 +0000515 for i in range(blocks):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000516 self.read(self.bufsize)
517 self.read(remainder)
518 else:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000519 raise StreamError("seeking backwards is not allowed")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000520 return self.pos
521
522 def read(self, size=None):
523 """Return the next size number of bytes from the stream.
524 If size is not defined, return all bytes of the stream
525 up to EOF.
526 """
527 if size is None:
528 t = []
529 while True:
530 buf = self._read(self.bufsize)
531 if not buf:
532 break
533 t.append(buf)
534 buf = "".join(t)
535 else:
536 buf = self._read(size)
537 self.pos += len(buf)
538 return buf
539
540 def _read(self, size):
541 """Return size bytes from the stream.
542 """
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000543 if self.comptype == "tar":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000544 return self.__read(size)
545
546 c = len(self.dbuf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000547 while c < size:
548 buf = self.__read(self.bufsize)
549 if not buf:
550 break
Guido van Rossumd8faa362007-04-27 19:54:29 +0000551 try:
552 buf = self.cmp.decompress(buf)
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100553 except self.exception:
Guido van Rossumd8faa362007-04-27 19:54:29 +0000554 raise ReadError("invalid compressed data")
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000555 self.dbuf += buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000556 c += len(buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000557 buf = self.dbuf[:size]
558 self.dbuf = self.dbuf[size:]
559 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000560
561 def __read(self, size):
562 """Return size bytes from stream. If internal buffer is empty,
563 read another block from the stream.
564 """
565 c = len(self.buf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000566 while c < size:
567 buf = self.fileobj.read(self.bufsize)
568 if not buf:
569 break
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000570 self.buf += buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000571 c += len(buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000572 buf = self.buf[:size]
573 self.buf = self.buf[size:]
574 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000575# class _Stream
576
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000577class _StreamProxy(object):
578 """Small proxy class that enables transparent compression
579 detection for the Stream interface (mode 'r|*').
580 """
581
582 def __init__(self, fileobj):
583 self.fileobj = fileobj
584 self.buf = self.fileobj.read(BLOCKSIZE)
585
586 def read(self, size):
587 self.read = self.fileobj.read
588 return self.buf
589
590 def getcomptype(self):
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100591 if self.buf.startswith(b"\x1f\x8b\x08"):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000592 return "gz"
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100593 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000594 return "bz2"
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100595 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
596 return "xz"
597 else:
598 return "tar"
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000599
600 def close(self):
601 self.fileobj.close()
602# class StreamProxy
603
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000604#------------------------
605# Extraction file object
606#------------------------
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000607class _FileInFile(object):
608 """A thin wrapper around an existing file object that
609 provides a part of its data as an individual file
610 object.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000611 """
612
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000613 def __init__(self, fileobj, offset, size, blockinfo=None):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000614 self.fileobj = fileobj
615 self.offset = offset
616 self.size = size
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000617 self.position = 0
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200618 self.name = getattr(fileobj, "name", None)
619 self.closed = False
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000620
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000621 if blockinfo is None:
622 blockinfo = [(0, size)]
623
624 # Construct a map with data and zero blocks.
625 self.map_index = 0
626 self.map = []
627 lastpos = 0
628 realpos = self.offset
629 for offset, size in blockinfo:
630 if offset > lastpos:
631 self.map.append((False, lastpos, offset, None))
632 self.map.append((True, offset, offset + size, realpos))
633 realpos += size
634 lastpos = offset + size
635 if lastpos < self.size:
636 self.map.append((False, lastpos, self.size, None))
637
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200638 def flush(self):
639 pass
640
641 def readable(self):
642 return True
643
644 def writable(self):
645 return False
646
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000647 def seekable(self):
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000648 return self.fileobj.seekable()
649
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000650 def tell(self):
651 """Return the current file position.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000652 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000653 return self.position
654
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200655 def seek(self, position, whence=io.SEEK_SET):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000656 """Seek to a position in the file.
657 """
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200658 if whence == io.SEEK_SET:
659 self.position = min(max(position, 0), self.size)
660 elif whence == io.SEEK_CUR:
661 if position < 0:
662 self.position = max(self.position + position, 0)
663 else:
664 self.position = min(self.position + position, self.size)
665 elif whence == io.SEEK_END:
666 self.position = max(min(self.size + position, self.size), 0)
667 else:
668 raise ValueError("Invalid argument")
669 return self.position
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000670
671 def read(self, size=None):
672 """Read data from the file.
673 """
674 if size is None:
675 size = self.size - self.position
676 else:
677 size = min(size, self.size - self.position)
678
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000679 buf = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000680 while size > 0:
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000681 while True:
682 data, start, stop, offset = self.map[self.map_index]
683 if start <= self.position < stop:
684 break
685 else:
686 self.map_index += 1
687 if self.map_index == len(self.map):
688 self.map_index = 0
689 length = min(size, stop - self.position)
690 if data:
Lars Gustäbeldd071042011-02-23 11:42:22 +0000691 self.fileobj.seek(offset + (self.position - start))
692 buf += self.fileobj.read(length)
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000693 else:
694 buf += NUL * length
695 size -= length
696 self.position += length
697 return buf
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000698
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200699 def readinto(self, b):
700 buf = self.read(len(b))
701 b[:len(buf)] = buf
702 return len(buf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000703
704 def close(self):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000705 self.closed = True
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200706#class _FileInFile
Martin v. Löwisdf241532005-03-03 08:17:42 +0000707
class ExFileObject(io.BufferedReader):
    """Buffered reader over the data of a single archive member.
    """

    def __init__(self, tarfile, tarinfo):
        # The raw stream exposes only this member's slice of the archive.
        raw = _FileInFile(tarfile.fileobj,
                          tarinfo.offset_data,
                          tarinfo.size,
                          tarinfo.sparse)
        super().__init__(raw)
#class ExFileObject
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000715
716#------------------
717# Exported Classes
718#------------------
719class TarInfo(object):
720 """Informational class which holds the details about an
721 archive member given by a tar header block.
722 TarInfo objects are returned by TarFile.getmember(),
723 TarFile.getmembers() and TarFile.gettarinfo() and are
724 usually created internally.
725 """
726
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000727 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
728 "chksum", "type", "linkname", "uname", "gname",
729 "devmajor", "devminor",
730 "offset", "offset_data", "pax_headers", "sparse",
731 "tarfile", "_sparse_structs", "_link_target")
732
def __init__(self, name=""):
    """Construct a TarInfo object. name is the optional name
       of the member.
    """
    # Identity and type of the member.
    self.name = name            # member name
    self.type = REGTYPE         # member type
    self.linkname = ""          # link name
    self.size = 0               # file size
    self.mtime = 0              # modification time
    self.chksum = 0             # header checksum

    # Permissions and ownership.
    self.mode = 0o644           # file permissions
    self.uid = 0                # user id
    self.gid = 0                # group id
    self.uname = ""             # user name
    self.gname = ""             # group name

    # Device numbers.
    self.devmajor = 0           # device major number
    self.devminor = 0           # device minor number

    # Positions inside the archive.
    self.offset = 0             # the tar header starts here
    self.offset_data = 0        # the file's data starts here

    self.sparse = None          # sparse member information
    self.pax_headers = {}       # pax header information
756
757 # In pax headers the "name" and "linkname" field are called
758 # "path" and "linkpath".
759 def _getpath(self):
760 return self.name
761 def _setpath(self, name):
762 self.name = name
763 path = property(_getpath, _setpath)
764
765 def _getlinkpath(self):
766 return self.linkname
767 def _setlinkpath(self, linkname):
768 self.linkname = linkname
769 linkpath = property(_getlinkpath, _setlinkpath)
770
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000771 def __repr__(self):
772 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
773
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000774 def get_info(self):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000775 """Return the TarInfo's attributes as a dictionary.
776 """
777 info = {
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +0000778 "name": self.name,
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000779 "mode": self.mode & 0o7777,
Guido van Rossumd8faa362007-04-27 19:54:29 +0000780 "uid": self.uid,
781 "gid": self.gid,
782 "size": self.size,
783 "mtime": self.mtime,
784 "chksum": self.chksum,
785 "type": self.type,
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +0000786 "linkname": self.linkname,
Guido van Rossumd8faa362007-04-27 19:54:29 +0000787 "uname": self.uname,
788 "gname": self.gname,
789 "devmajor": self.devmajor,
790 "devminor": self.devminor
791 }
792
793 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
794 info["name"] += "/"
795
796 return info
797
Victor Stinnerde629d42010-05-05 21:43:57 +0000798 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000799 """Return a tar header as a string of 512 byte blocks.
800 """
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000801 info = self.get_info()
Guido van Rossume7ba4952007-06-06 23:52:48 +0000802
Guido van Rossumd8faa362007-04-27 19:54:29 +0000803 if format == USTAR_FORMAT:
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000804 return self.create_ustar_header(info, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000805 elif format == GNU_FORMAT:
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000806 return self.create_gnu_header(info, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000807 elif format == PAX_FORMAT:
Lars Gustäbel1465cc22010-05-17 18:02:50 +0000808 return self.create_pax_header(info, encoding)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000809 else:
810 raise ValueError("invalid format")
811
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000812 def create_ustar_header(self, info, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000813 """Return the object as a ustar header block.
814 """
Guido van Rossumd8faa362007-04-27 19:54:29 +0000815 info["magic"] = POSIX_MAGIC
816
817 if len(info["linkname"]) > LENGTH_LINK:
818 raise ValueError("linkname is too long")
819
820 if len(info["name"]) > LENGTH_NAME:
821 info["prefix"], info["name"] = self._posix_split_name(info["name"])
822
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000823 return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000824
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000825 def create_gnu_header(self, info, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000826 """Return the object as a GNU header block sequence.
827 """
Guido van Rossumd8faa362007-04-27 19:54:29 +0000828 info["magic"] = GNU_MAGIC
829
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000830 buf = b""
Guido van Rossumd8faa362007-04-27 19:54:29 +0000831 if len(info["linkname"]) > LENGTH_LINK:
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000832 buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000833
834 if len(info["name"]) > LENGTH_NAME:
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000835 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000836
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000837 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000838
Lars Gustäbel1465cc22010-05-17 18:02:50 +0000839 def create_pax_header(self, info, encoding):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000840 """Return the object as a ustar header block. If it cannot be
841 represented this way, prepend a pax extended header sequence
842 with supplement information.
843 """
Guido van Rossumd8faa362007-04-27 19:54:29 +0000844 info["magic"] = POSIX_MAGIC
845 pax_headers = self.pax_headers.copy()
846
847 # Test string fields for values that exceed the field length or cannot
848 # be represented in ASCII encoding.
849 for name, hname, length in (
850 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
851 ("uname", "uname", 32), ("gname", "gname", 32)):
852
Guido van Rossume7ba4952007-06-06 23:52:48 +0000853 if hname in pax_headers:
854 # The pax header has priority.
855 continue
856
Guido van Rossumd8faa362007-04-27 19:54:29 +0000857 # Try to encode the string as ASCII.
858 try:
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000859 info[name].encode("ascii", "strict")
Guido van Rossumd8faa362007-04-27 19:54:29 +0000860 except UnicodeEncodeError:
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000861 pax_headers[hname] = info[name]
Guido van Rossumd8faa362007-04-27 19:54:29 +0000862 continue
863
Guido van Rossume7ba4952007-06-06 23:52:48 +0000864 if len(info[name]) > length:
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000865 pax_headers[hname] = info[name]
Guido van Rossumd8faa362007-04-27 19:54:29 +0000866
867 # Test number fields for values that exceed the field limit or values
868 # that like to be stored as float.
869 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
Guido van Rossume7ba4952007-06-06 23:52:48 +0000870 if name in pax_headers:
871 # The pax header has priority. Avoid overflow.
872 info[name] = 0
873 continue
874
Guido van Rossumd8faa362007-04-27 19:54:29 +0000875 val = info[name]
876 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000877 pax_headers[name] = str(val)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000878 info[name] = 0
879
Guido van Rossume7ba4952007-06-06 23:52:48 +0000880 # Create a pax extended header if necessary.
Guido van Rossumd8faa362007-04-27 19:54:29 +0000881 if pax_headers:
Lars Gustäbel1465cc22010-05-17 18:02:50 +0000882 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000883 else:
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000884 buf = b""
Guido van Rossumd8faa362007-04-27 19:54:29 +0000885
Lars Gustäbel3741eff2007-08-21 12:17:05 +0000886 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +0000887
888 @classmethod
Lars Gustäbel3741eff2007-08-21 12:17:05 +0000889 def create_pax_global_header(cls, pax_headers):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000890 """Return the object as a pax global header block sequence.
891 """
Marc-André Lemburg8f36af72011-02-25 15:42:01 +0000892 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
Guido van Rossumd8faa362007-04-27 19:54:29 +0000893
894 def _posix_split_name(self, name):
895 """Split a name longer than 100 chars into a prefix
896 and a name part.
897 """
898 prefix = name[:LENGTH_PREFIX + 1]
899 while prefix and prefix[-1] != "/":
900 prefix = prefix[:-1]
901
902 name = name[len(prefix):]
903 prefix = prefix[:-1]
904
905 if not prefix or len(name) > LENGTH_NAME:
906 raise ValueError("name is too long")
907 return prefix, name
908
909 @staticmethod
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000910 def _create_header(info, format, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000911 """Return a header block. info is a dictionary with file
912 information, format must be one of the *_FORMAT constants.
913 """
914 parts = [
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000915 stn(info.get("name", ""), 100, encoding, errors),
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000916 itn(info.get("mode", 0) & 0o7777, 8, format),
Guido van Rossumd8faa362007-04-27 19:54:29 +0000917 itn(info.get("uid", 0), 8, format),
918 itn(info.get("gid", 0), 8, format),
919 itn(info.get("size", 0), 12, format),
920 itn(info.get("mtime", 0), 12, format),
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000921 b" ", # checksum field
Guido van Rossumd8faa362007-04-27 19:54:29 +0000922 info.get("type", REGTYPE),
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000923 stn(info.get("linkname", ""), 100, encoding, errors),
924 info.get("magic", POSIX_MAGIC),
Lars Gustäbel331b8002010-10-04 15:18:47 +0000925 stn(info.get("uname", ""), 32, encoding, errors),
926 stn(info.get("gname", ""), 32, encoding, errors),
Guido van Rossumd8faa362007-04-27 19:54:29 +0000927 itn(info.get("devmajor", 0), 8, format),
928 itn(info.get("devminor", 0), 8, format),
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000929 stn(info.get("prefix", ""), 155, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000930 ]
931
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000932 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
Guido van Rossumd8faa362007-04-27 19:54:29 +0000933 chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
Lars Gustäbela280ca752007-08-28 07:34:33 +0000934 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
Guido van Rossumd8faa362007-04-27 19:54:29 +0000935 return buf
936
937 @staticmethod
938 def _create_payload(payload):
939 """Return the string payload filled with zero bytes
940 up to the next 512 byte border.
941 """
942 blocks, remainder = divmod(len(payload), BLOCKSIZE)
943 if remainder > 0:
944 payload += (BLOCKSIZE - remainder) * NUL
945 return payload
946
947 @classmethod
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000948 def _create_gnu_long_header(cls, name, type, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000949 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
950 for name.
951 """
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000952 name = name.encode(encoding, errors) + NUL
Guido van Rossumd8faa362007-04-27 19:54:29 +0000953
954 info = {}
955 info["name"] = "././@LongLink"
956 info["type"] = type
957 info["size"] = len(name)
958 info["magic"] = GNU_MAGIC
959
960 # create extended header + name blocks.
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000961 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
Guido van Rossumd8faa362007-04-27 19:54:29 +0000962 cls._create_payload(name)
963
964 @classmethod
Lars Gustäbel1465cc22010-05-17 18:02:50 +0000965 def _create_pax_generic_header(cls, pax_headers, type, encoding):
966 """Return a POSIX.1-2008 extended or global header sequence
Guido van Rossumd8faa362007-04-27 19:54:29 +0000967 that contains a list of keyword, value pairs. The values
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000968 must be strings.
Guido van Rossumd8faa362007-04-27 19:54:29 +0000969 """
Lars Gustäbel1465cc22010-05-17 18:02:50 +0000970 # Check if one of the fields contains surrogate characters and thereby
971 # forces hdrcharset=BINARY, see _proc_pax() for more information.
972 binary = False
973 for keyword, value in pax_headers.items():
974 try:
Marc-André Lemburg8f36af72011-02-25 15:42:01 +0000975 value.encode("utf-8", "strict")
Lars Gustäbel1465cc22010-05-17 18:02:50 +0000976 except UnicodeEncodeError:
977 binary = True
978 break
979
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000980 records = b""
Lars Gustäbel1465cc22010-05-17 18:02:50 +0000981 if binary:
982 # Put the hdrcharset field at the beginning of the header.
983 records += b"21 hdrcharset=BINARY\n"
984
Guido van Rossumd8faa362007-04-27 19:54:29 +0000985 for keyword, value in pax_headers.items():
Marc-André Lemburg8f36af72011-02-25 15:42:01 +0000986 keyword = keyword.encode("utf-8")
Lars Gustäbel1465cc22010-05-17 18:02:50 +0000987 if binary:
988 # Try to restore the original byte representation of `value'.
989 # Needless to say, that the encoding must match the string.
990 value = value.encode(encoding, "surrogateescape")
991 else:
Marc-André Lemburg8f36af72011-02-25 15:42:01 +0000992 value = value.encode("utf-8")
Lars Gustäbel1465cc22010-05-17 18:02:50 +0000993
Guido van Rossumd8faa362007-04-27 19:54:29 +0000994 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n'
995 n = p = 0
996 while True:
997 n = l + len(str(p))
998 if n == p:
999 break
1000 p = n
Lars Gustäbela280ca752007-08-28 07:34:33 +00001001 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"
Guido van Rossumd8faa362007-04-27 19:54:29 +00001002
1003 # We use a hardcoded "././@PaxHeader" name like star does
1004 # instead of the one that POSIX recommends.
1005 info = {}
1006 info["name"] = "././@PaxHeader"
1007 info["type"] = type
1008 info["size"] = len(records)
1009 info["magic"] = POSIX_MAGIC
1010
1011 # Create pax header + record blocks.
Lars Gustäbel3741eff2007-08-21 12:17:05 +00001012 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
Guido van Rossumd8faa362007-04-27 19:54:29 +00001013 cls._create_payload(records)
1014
Guido van Rossum75b64e62005-01-16 00:16:11 +00001015 @classmethod
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001016 def frombuf(cls, buf, encoding, errors):
1017 """Construct a TarInfo object from a 512 byte bytes object.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001018 """
Lars Gustäbel9520a432009-11-22 18:48:49 +00001019 if len(buf) == 0:
1020 raise EmptyHeaderError("empty header")
Thomas Wouters477c8d52006-05-27 19:21:47 +00001021 if len(buf) != BLOCKSIZE:
Lars Gustäbel9520a432009-11-22 18:48:49 +00001022 raise TruncatedHeaderError("truncated header")
Thomas Wouters477c8d52006-05-27 19:21:47 +00001023 if buf.count(NUL) == BLOCKSIZE:
Lars Gustäbel9520a432009-11-22 18:48:49 +00001024 raise EOFHeaderError("end of file header")
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001025
1026 chksum = nti(buf[148:156])
1027 if chksum not in calc_chksums(buf):
Lars Gustäbel9520a432009-11-22 18:48:49 +00001028 raise InvalidHeaderError("bad checksum")
Thomas Wouters477c8d52006-05-27 19:21:47 +00001029
Guido van Rossumd8faa362007-04-27 19:54:29 +00001030 obj = cls()
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001031 obj.name = nts(buf[0:100], encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001032 obj.mode = nti(buf[100:108])
1033 obj.uid = nti(buf[108:116])
1034 obj.gid = nti(buf[116:124])
1035 obj.size = nti(buf[124:136])
1036 obj.mtime = nti(buf[136:148])
1037 obj.chksum = chksum
1038 obj.type = buf[156:157]
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001039 obj.linkname = nts(buf[157:257], encoding, errors)
1040 obj.uname = nts(buf[265:297], encoding, errors)
1041 obj.gname = nts(buf[297:329], encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001042 obj.devmajor = nti(buf[329:337])
1043 obj.devminor = nti(buf[337:345])
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001044 prefix = nts(buf[345:500], encoding, errors)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001045
Guido van Rossumd8faa362007-04-27 19:54:29 +00001046 # Old V7 tar format represents a directory as a regular
1047 # file with a trailing slash.
1048 if obj.type == AREGTYPE and obj.name.endswith("/"):
1049 obj.type = DIRTYPE
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001050
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +00001051 # The old GNU sparse format occupies some of the unused
1052 # space in the buffer for up to 4 sparse structures.
1053 # Save the them for later processing in _proc_sparse().
1054 if obj.type == GNUTYPE_SPARSE:
1055 pos = 386
1056 structs = []
1057 for i in range(4):
1058 try:
1059 offset = nti(buf[pos:pos + 12])
1060 numbytes = nti(buf[pos + 12:pos + 24])
1061 except ValueError:
1062 break
1063 structs.append((offset, numbytes))
1064 pos += 24
1065 isextended = bool(buf[482])
1066 origsize = nti(buf[483:495])
1067 obj._sparse_structs = (structs, isextended, origsize)
1068
Guido van Rossumd8faa362007-04-27 19:54:29 +00001069 # Remove redundant slashes from directories.
1070 if obj.isdir():
1071 obj.name = obj.name.rstrip("/")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001072
Guido van Rossumd8faa362007-04-27 19:54:29 +00001073 # Reconstruct a ustar longname.
1074 if prefix and obj.type not in GNU_TYPES:
1075 obj.name = prefix + "/" + obj.name
1076 return obj
1077
1078 @classmethod
1079 def fromtarfile(cls, tarfile):
1080 """Return the next TarInfo object from TarFile object
1081 tarfile.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001082 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001083 buf = tarfile.fileobj.read(BLOCKSIZE)
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001084 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001085 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1086 return obj._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001087
Guido van Rossumd8faa362007-04-27 19:54:29 +00001088 #--------------------------------------------------------------------------
1089 # The following are methods that are called depending on the type of a
1090 # member. The entry point is _proc_member() which can be overridden in a
1091 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1092 # implement the following
1093 # operations:
1094 # 1. Set self.offset_data to the position where the data blocks begin,
1095 # if there is data that follows.
1096 # 2. Set tarfile.offset to the position where the next member's header will
1097 # begin.
1098 # 3. Return self or another valid TarInfo object.
1099 def _proc_member(self, tarfile):
1100 """Choose the right processing method depending on
1101 the type and call it.
Thomas Wouters89f507f2006-12-13 04:49:30 +00001102 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001103 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1104 return self._proc_gnulong(tarfile)
1105 elif self.type == GNUTYPE_SPARSE:
1106 return self._proc_sparse(tarfile)
1107 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1108 return self._proc_pax(tarfile)
1109 else:
1110 return self._proc_builtin(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001111
Guido van Rossumd8faa362007-04-27 19:54:29 +00001112 def _proc_builtin(self, tarfile):
1113 """Process a builtin type or an unknown type which
1114 will be treated as a regular file.
1115 """
1116 self.offset_data = tarfile.fileobj.tell()
1117 offset = self.offset_data
1118 if self.isreg() or self.type not in SUPPORTED_TYPES:
1119 # Skip the following data blocks.
1120 offset += self._block(self.size)
1121 tarfile.offset = offset
Thomas Wouters89f507f2006-12-13 04:49:30 +00001122
Guido van Rossume7ba4952007-06-06 23:52:48 +00001123 # Patch the TarInfo object with saved global
Guido van Rossumd8faa362007-04-27 19:54:29 +00001124 # header information.
Guido van Rossume7ba4952007-06-06 23:52:48 +00001125 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001126
1127 return self
1128
1129 def _proc_gnulong(self, tarfile):
1130 """Process the blocks that hold a GNU longname
1131 or longlink member.
1132 """
1133 buf = tarfile.fileobj.read(self._block(self.size))
1134
1135 # Fetch the next header and process it.
Lars Gustäbel9520a432009-11-22 18:48:49 +00001136 try:
1137 next = self.fromtarfile(tarfile)
1138 except HeaderError:
1139 raise SubsequentHeaderError("missing or bad subsequent header")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001140
1141 # Patch the TarInfo object from the next header with
1142 # the longname information.
1143 next.offset = self.offset
1144 if self.type == GNUTYPE_LONGNAME:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001145 next.name = nts(buf, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001146 elif self.type == GNUTYPE_LONGLINK:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001147 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001148
1149 return next
1150
1151 def _proc_sparse(self, tarfile):
1152 """Process a GNU sparse header plus extra headers.
1153 """
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +00001154 # We already collected some sparse structures in frombuf().
1155 structs, isextended, origsize = self._sparse_structs
1156 del self._sparse_structs
Guido van Rossumd8faa362007-04-27 19:54:29 +00001157
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +00001158 # Collect sparse structures from extended header blocks.
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001159 while isextended:
Guido van Rossumd8faa362007-04-27 19:54:29 +00001160 buf = tarfile.fileobj.read(BLOCKSIZE)
1161 pos = 0
Guido van Rossum805365e2007-05-07 22:24:25 +00001162 for i in range(21):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001163 try:
1164 offset = nti(buf[pos:pos + 12])
1165 numbytes = nti(buf[pos + 12:pos + 24])
1166 except ValueError:
1167 break
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001168 if offset and numbytes:
1169 structs.append((offset, numbytes))
Guido van Rossumd8faa362007-04-27 19:54:29 +00001170 pos += 24
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001171 isextended = bool(buf[504])
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001172 self.sparse = structs
Guido van Rossumd8faa362007-04-27 19:54:29 +00001173
1174 self.offset_data = tarfile.fileobj.tell()
1175 tarfile.offset = self.offset_data + self._block(self.size)
1176 self.size = origsize
Guido van Rossumd8faa362007-04-27 19:54:29 +00001177 return self
1178
1179 def _proc_pax(self, tarfile):
1180 """Process an extended or global header as described in
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001181 POSIX.1-2008.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001182 """
1183 # Read the header information.
1184 buf = tarfile.fileobj.read(self._block(self.size))
1185
1186 # A pax header stores supplemental information for either
1187 # the following file (extended) or all following files
1188 # (global).
1189 if self.type == XGLTYPE:
1190 pax_headers = tarfile.pax_headers
1191 else:
1192 pax_headers = tarfile.pax_headers.copy()
1193
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001194 # Check if the pax header contains a hdrcharset field. This tells us
1195 # the encoding of the path, linkpath, uname and gname fields. Normally,
1196 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1197 # implementations are allowed to store them as raw binary strings if
1198 # the translation to UTF-8 fails.
1199 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1200 if match is not None:
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001201 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001202
1203 # For the time being, we don't care about anything other than "BINARY".
1204 # The only other value that is currently allowed by the standard is
1205 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1206 hdrcharset = pax_headers.get("hdrcharset")
1207 if hdrcharset == "BINARY":
1208 encoding = tarfile.encoding
1209 else:
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001210 encoding = "utf-8"
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001211
Guido van Rossumd8faa362007-04-27 19:54:29 +00001212 # Parse pax header information. A record looks like that:
1213 # "%d %s=%s\n" % (length, keyword, value). length is the size
1214 # of the complete record including the length field itself and
Guido van Rossume7ba4952007-06-06 23:52:48 +00001215 # the newline. keyword and value are both UTF-8 encoded strings.
Antoine Pitroufd036452008-08-19 17:56:33 +00001216 regex = re.compile(br"(\d+) ([^=]+)=")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001217 pos = 0
1218 while True:
1219 match = regex.match(buf, pos)
1220 if not match:
1221 break
1222
1223 length, keyword = match.groups()
1224 length = int(length)
1225 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1226
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001227 # Normally, we could just use "utf-8" as the encoding and "strict"
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001228 # as the error handler, but we better not take the risk. For
1229 # example, GNU tar <= 1.23 is known to store filenames it cannot
1230 # translate to UTF-8 as raw strings (unfortunately without a
1231 # hdrcharset=BINARY header).
1232 # We first try the strict standard encoding, and if that fails we
1233 # fall back on the user's encoding and error handler.
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001234 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001235 tarfile.errors)
1236 if keyword in PAX_NAME_FIELDS:
1237 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1238 tarfile.errors)
1239 else:
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001240 value = self._decode_pax_field(value, "utf-8", "utf-8",
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001241 tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001242
1243 pax_headers[keyword] = value
1244 pos += length
1245
Guido van Rossume7ba4952007-06-06 23:52:48 +00001246 # Fetch the next header.
Lars Gustäbel9520a432009-11-22 18:48:49 +00001247 try:
1248 next = self.fromtarfile(tarfile)
1249 except HeaderError:
1250 raise SubsequentHeaderError("missing or bad subsequent header")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001251
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001252 # Process GNU sparse information.
1253 if "GNU.sparse.map" in pax_headers:
1254 # GNU extended sparse format version 0.1.
1255 self._proc_gnusparse_01(next, pax_headers)
1256
1257 elif "GNU.sparse.size" in pax_headers:
1258 # GNU extended sparse format version 0.0.
1259 self._proc_gnusparse_00(next, pax_headers, buf)
1260
1261 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1262 # GNU extended sparse format version 1.0.
1263 self._proc_gnusparse_10(next, pax_headers, tarfile)
1264
Guido van Rossume7ba4952007-06-06 23:52:48 +00001265 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
Guido van Rossume7ba4952007-06-06 23:52:48 +00001266 # Patch the TarInfo object with the extended header info.
1267 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1268 next.offset = self.offset
1269
1270 if "size" in pax_headers:
1271 # If the extended header replaces the size field,
1272 # we need to recalculate the offset where the next
1273 # header starts.
1274 offset = next.offset_data
1275 if next.isreg() or next.type not in SUPPORTED_TYPES:
1276 offset += next._block(next.size)
1277 tarfile.offset = offset
1278
1279 return next
1280
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001281 def _proc_gnusparse_00(self, next, pax_headers, buf):
1282 """Process a GNU tar extended sparse header, version 0.0.
1283 """
1284 offsets = []
1285 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1286 offsets.append(int(match.group(1)))
1287 numbytes = []
1288 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1289 numbytes.append(int(match.group(1)))
1290 next.sparse = list(zip(offsets, numbytes))
1291
1292 def _proc_gnusparse_01(self, next, pax_headers):
1293 """Process a GNU tar extended sparse header, version 0.1.
1294 """
1295 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1296 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1297
1298 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1299 """Process a GNU tar extended sparse header, version 1.0.
1300 """
1301 fields = None
1302 sparse = []
1303 buf = tarfile.fileobj.read(BLOCKSIZE)
1304 fields, buf = buf.split(b"\n", 1)
1305 fields = int(fields)
1306 while len(sparse) < fields * 2:
1307 if b"\n" not in buf:
1308 buf += tarfile.fileobj.read(BLOCKSIZE)
1309 number, buf = buf.split(b"\n", 1)
1310 sparse.append(int(number))
1311 next.offset_data = tarfile.fileobj.tell()
1312 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1313
Guido van Rossume7ba4952007-06-06 23:52:48 +00001314 def _apply_pax_info(self, pax_headers, encoding, errors):
1315 """Replace fields with supplemental information from a previous
1316 pax extended or global header.
1317 """
1318 for keyword, value in pax_headers.items():
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001319 if keyword == "GNU.sparse.name":
1320 setattr(self, "path", value)
1321 elif keyword == "GNU.sparse.size":
1322 setattr(self, "size", int(value))
1323 elif keyword == "GNU.sparse.realsize":
1324 setattr(self, "size", int(value))
1325 elif keyword in PAX_FIELDS:
1326 if keyword in PAX_NUMBER_FIELDS:
1327 try:
1328 value = PAX_NUMBER_FIELDS[keyword](value)
1329 except ValueError:
1330 value = 0
1331 if keyword == "path":
1332 value = value.rstrip("/")
1333 setattr(self, keyword, value)
Guido van Rossume7ba4952007-06-06 23:52:48 +00001334
1335 self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001336
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001337 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1338 """Decode a single field from a pax record.
1339 """
1340 try:
1341 return value.decode(encoding, "strict")
1342 except UnicodeDecodeError:
1343 return value.decode(fallback_encoding, fallback_errors)
1344
Guido van Rossumd8faa362007-04-27 19:54:29 +00001345 def _block(self, count):
1346 """Round up a byte count by BLOCKSIZE and return it,
1347 e.g. _block(834) => 1024.
1348 """
1349 blocks, remainder = divmod(count, BLOCKSIZE)
1350 if remainder:
1351 blocks += 1
1352 return blocks * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001353
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001354 def isreg(self):
1355 return self.type in REGULAR_TYPES
1356 def isfile(self):
1357 return self.isreg()
1358 def isdir(self):
1359 return self.type == DIRTYPE
1360 def issym(self):
1361 return self.type == SYMTYPE
1362 def islnk(self):
1363 return self.type == LNKTYPE
1364 def ischr(self):
1365 return self.type == CHRTYPE
1366 def isblk(self):
1367 return self.type == BLKTYPE
1368 def isfifo(self):
1369 return self.type == FIFOTYPE
1370 def issparse(self):
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001371 return self.sparse is not None
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001372 def isdev(self):
1373 return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1374# class TarInfo
1375
1376class TarFile(object):
1377 """The TarFile Class provides an interface to tar archives.
1378 """
1379
1380 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs)
1381
1382 dereference = False # If true, add content of linked file to the
1383 # tar file, else the link.
1384
1385 ignore_zeros = False # If true, skips empty or invalid blocks and
1386 # continues processing.
1387
Lars Gustäbel365aff32009-12-13 11:42:29 +00001388 errorlevel = 1 # If 0, fatal errors only appear in debug
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001389 # messages (if debug >= 0). If > 0, errors
1390 # are passed to the caller as exceptions.
1391
Guido van Rossumd8faa362007-04-27 19:54:29 +00001392 format = DEFAULT_FORMAT # The format to use when creating an archive.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001393
Guido van Rossume7ba4952007-06-06 23:52:48 +00001394 encoding = ENCODING # Encoding for 8-bit character strings.
1395
1396 errors = None # Error handler for unicode conversion.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001397
Guido van Rossumd8faa362007-04-27 19:54:29 +00001398 tarinfo = TarInfo # The default TarInfo class to use.
1399
Lars Gustäbelb062a2f2012-05-14 13:18:16 +02001400 fileobject = ExFileObject # The file-object for extractfile().
Guido van Rossumd8faa362007-04-27 19:54:29 +00001401
def __init__(self, name=None, mode="r", fileobj=None, format=None,
        tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
        errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
    """Open an (uncompressed) tar archive `name'.

       `mode' selects how the archive is opened and defaults to 'r':
       'r' reads an existing archive, 'a' appends to an existing file
       (creating it when it does not exist), 'w' creates a new file,
       overwriting an existing one, and 'x' maps to the exclusive
       creation file mode 'xb'. Any other value raises ValueError.

       If `fileobj' is given, it is used for reading or writing data and
       is not closed when the TarFile is closed. If `fileobj' has a `mode'
       attribute, that value replaces the internal file mode.

       The remaining arguments override the class-level defaults of the
       same name when they are not None; `errors' is always stored.
       `pax_headers' is only honoured when the format is PAX_FORMAT.
    """
    file_modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
    native_mode = file_modes.get(mode)
    if native_mode is None:
        raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
    self.mode = mode
    self._mode = native_mode

    if fileobj:
        # Caller-supplied file object: borrow its name and mode when they
        # are available, and remember that we must not close it.
        if (name is None and hasattr(fileobj, "name") and
                isinstance(fileobj.name, (str, bytes))):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
    else:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        self._extfileobj = False
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Per-instance overrides of the class-level defaults.
    overrides = (
        ("format", format),
        ("tarinfo", tarinfo),
        ("dereference", dereference),
        ("ignore_zeros", ignore_zeros),
        ("encoding", encoding),
    )
    for attr_name, value in overrides:
        if value is not None:
            setattr(self, attr_name, value)
    self.errors = errors

    use_pax_headers = pax_headers is not None and self.format == PAX_FORMAT
    self.pax_headers = pax_headers if use_pax_headers else {}

    if debug is not None:
        self.debug = debug
    if errorlevel is not None:
        self.errorlevel = errorlevel

    # Bookkeeping structures.
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Walk over the existing members so that the file position
            # ends up at the end of the archive, before the first empty
            # block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    member = self.tarinfo.fromtarfile(self)
                    self.members.append(member)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e))

        if self.mode in ("a", "w", "x"):
            self._loaded = True

            if self.pax_headers:
                header = self.tarinfo.create_pax_global_header(
                    self.pax_headers.copy())
                self.fileobj.write(header)
                self.offset += len(header)
    except BaseException:
        # Do not leak a file we opened ourselves when setup fails.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001499
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001500 #--------------------------------------------------------------------------
1501 # Below are the classmethods which act as alternate constructors to the
1502 # TarFile class. The open() method is the only one that is needed for
1503 # public use; it is the "super"-constructor and is able to select an
1504 # adequate "sub"-constructor for a particular compression using the mapping
1505 # from OPEN_METH.
1506 #
1507 # This concept allows one to subclass TarFile without losing the comfort of
1508 # the super-constructor. A sub-constructor is registered and made available
1509 # by adding it to the mapping in OPEN_METH.
1510
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending and return
       an appropriate TarFile instance.

       Random-access modes (`filemode:compression'):
       'r' or 'r:*'  read, trying every registered compression in turn
       'r:'          read, uncompressed only
       'r:gz'        read, gzip compression
       'r:bz2'       read, bzip2 compression
       'r:xz'        read, lzma compression
       'a' or 'a:'   append, creating the file if necessary
       'w' or 'w:'   write, no compression
       'w:gz'        write, gzip compression
       'w:bz2'       write, bzip2 compression
       'w:xz'        write, lzma compression
       'x' or 'x:'   create exclusively, no compression
       'x:gz'        create exclusively, gzip compression
       'x:bz2'       create exclusively, bzip2 compression
       'x:xz'        create exclusively, lzma compression
       The 'x' modes raise an exception if the file already exists.

       Stream modes (`filemode|compression'), read or write only:
       'r|*'         read a stream, transparent compression
       'r|'          read an uncompressed stream
       'r|gz'        read a gzip compressed stream
       'r|bz2'       read a bzip2 compressed stream
       'r|xz'        read an lzma compressed stream
       'w|'          write an uncompressed stream
       'w|gz'        write a gzip compressed stream
       'w|bz2'       write a bzip2 compressed stream
       'w|xz'        write an lzma compressed stream

       ValueError is raised when neither `name' nor `fileobj' is given or
       when `mode' cannot be interpreted.
    """
    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Try every registered *open() method until one succeeds.
        for method_name in cls.OPEN_METH.values():
            opener = getattr(cls, method_name)
            rewind_to = None
            if fileobj is not None:
                rewind_to = fileobj.tell()
            try:
                return opener(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Restore the position for the next attempt.
                if fileobj is not None:
                    fileobj.seek(rewind_to)
        raise ReadError("file could not be opened successfully")

    if ":" in mode:
        filemode, _, comptype = mode.partition(":")
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Pick the *open() method registered for this compression.
        if comptype not in cls.OPEN_METH:
            raise CompressionError("unknown compression type %r" % comptype)
        opener = getattr(cls, cls.OPEN_METH[comptype])
        return opener(name, filemode, fileobj, **kwargs)

    if "|" in mode:
        filemode, _, comptype = mode.partition("|")
        filemode = filemode or "r"
        comptype = comptype or "tar"

        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            archive = cls(name, filemode, stream, **kwargs)
        except BaseException:
            stream.close()
            raise
        # The stream wrapper belongs to the archive and is closed with it.
        archive._extfileobj = False
        return archive

    if mode in ("a", "w", "x"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001599
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open the uncompressed tar archive `name' for reading or writing.

       `mode' must be 'r', 'a', 'w' or 'x', otherwise ValueError is
       raised. All arguments are passed on to the class constructor.
    """
    if mode in ("r", "a", "w", "x"):
        return cls(name, mode, fileobj, **kwargs)
    raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001607
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open the gzip compressed tar archive `name' for reading or writing.
       Appending is not allowed: `mode' must be 'r', 'w' or 'x'.

       CompressionError is raised if the gzip module is unusable. In read
       mode an OSError from the gzip layer is reported as ReadError.
    """
    if mode not in ("r", "w", "x"):
        raise ValueError("mode must be 'r', 'w' or 'x'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    try:
        gz_file = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
    except OSError:
        if fileobj is not None and mode == 'r':
            raise ReadError("not a gzip file")
        raise

    try:
        archive = cls.taropen(name, mode, gz_file, **kwargs)
    except BaseException as exc:
        # Whatever went wrong, the gzip wrapper must not stay open.
        gz_file.close()
        if isinstance(exc, OSError) and mode == 'r':
            raise ReadError("not a gzip file")
        raise
    # The gzip wrapper is owned by the archive and closed together with it.
    archive._extfileobj = False
    return archive
1641
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open the bzip2 compressed tar archive `name' for reading or writing.
       Appending is not allowed: `mode' must be 'r', 'w' or 'x'.

       CompressionError is raised if the bz2 module cannot be imported.
       In read mode an OSError or EOFError raised while opening the
       archive is reported as ReadError.
    """
    if mode not in ("r", "w", "x"):
        raise ValueError("mode must be 'r', 'w' or 'x'")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    bz2_file = bz2.BZ2File(fileobj or name, mode,
                           compresslevel=compresslevel)

    try:
        archive = cls.taropen(name, mode, bz2_file, **kwargs)
    except BaseException as exc:
        # Whatever went wrong, the bz2 wrapper must not stay open.
        bz2_file.close()
        if isinstance(exc, (OSError, EOFError)) and mode == 'r':
            raise ReadError("not a bzip2 file")
        raise
    # The bz2 wrapper is owned by the archive and closed together with it.
    archive._extfileobj = False
    return archive
1670
@classmethod
def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
    """Open the lzma compressed tar archive `name' for reading or writing.
       Appending is not allowed: `mode' must be 'r', 'w' or 'x'.

       CompressionError is raised if the lzma module cannot be imported.
       In read mode an LZMAError or EOFError raised while opening the
       archive is reported as ReadError.
    """
    if mode not in ("r", "w", "x"):
        raise ValueError("mode must be 'r', 'w' or 'x'")

    try:
        import lzma
    except ImportError:
        raise CompressionError("lzma module is not available")

    xz_file = lzma.LZMAFile(fileobj or name, mode, preset=preset)

    try:
        archive = cls.taropen(name, mode, xz_file, **kwargs)
    except BaseException as exc:
        # Whatever went wrong, the lzma wrapper must not stay open.
        xz_file.close()
        if isinstance(exc, (lzma.LZMAError, EOFError)) and mode == 'r':
            raise ReadError("not an lzma file")
        raise
    # The lzma wrapper is owned by the archive and closed together with it.
    archive._extfileobj = False
    return archive
1698
# Registry of all *open() methods: maps the compression suffix used in a
# mode string to the name of the classmethod that handles it.
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz": "gzopen",     # gzip compressed tar
    "bz2": "bz2open",   # bzip2 compressed tar
    "xz": "xzopen",     # lzma compressed tar
}
1706
1707 #--------------------------------------------------------------------------
1708 # The public methods which TarFile provides:
1709
def close(self):
    """Close the TarFile. In the write modes ('a', 'w', 'x') two finishing
       zero blocks are appended to the archive and the file is padded with
       zeros up to a multiple of RECORDSIZE. Calling close() on an already
       closed TarFile does nothing.
    """
    if self.closed:
        return

    self.closed = True
    try:
        if self.mode in ("a", "w", "x"):
            end_marker_size = BLOCKSIZE * 2
            self.fileobj.write(NUL * end_marker_size)
            self.offset += end_marker_size
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            remainder = self.offset % RECORDSIZE
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
    finally:
        # A file object handed in by the caller is left open.
        if not self._extfileobj:
            self.fileobj.close()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001730
def getmember(self, name):
    """Return the TarInfo object for member `name'.

       The lookup is delegated to self._getmember(). KeyError is raised
       if that lookup yields None. If a member occurs more than once in
       the archive, its last occurrence is assumed to be the most
       up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001741
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects,
       in the same order as they appear in the archive.
    """
    self._check()
    # A complete list requires that the whole archive has been scanned.
    if self._loaded:
        return self.members
    self._load()
    return self.members
1751
def getnames(self):
    """Return the names of the archive members as a list, in the same
       order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001757
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object for either the file `name' or the file
       object `fileobj' (using os.fstat on its file descriptor).

       The attributes of the returned TarInfo may be modified before it
       is added with addfile(). If given, `arcname' specifies an
       alternative name for the file in the archive. None is returned
       when the file is of a type that cannot be stored.
    """
    self._check("awx")

    # A file object, when given, dictates the file name.
    if fileobj is not None:
        name = fileobj.name

    # Build the member name: drop a drive prefix, convert the platform
    # path separator to forward slashes and make the path relative.
    if arcname is None:
        arcname = name
    arcname = os.path.splitdrive(arcname)[1]
    arcname = arcname.replace(os.sep, "/").lstrip("/")

    # Now, fill the TarInfo object with
    # information specific for the file.
    tarinfo = self.tarinfo()
    tarinfo.tarfile = self

    # Use os.fstat, os.lstat or os.stat, depending on the input, the
    # platform and whether symlinks shall be resolved.
    if fileobj is not None:
        st = os.fstat(fileobj.fileno())
    elif hasattr(os, "lstat") and not self.dereference:
        st = os.lstat(name)
    else:
        st = os.stat(name)
    linkname = ""

    file_mode = st.st_mode
    if stat.S_ISREG(file_mode):
        inode = (st.st_ino, st.st_dev)
        already_archived = (not self.dereference and st.st_nlink > 1 and
                            inode in self.inodes and
                            arcname != self.inodes[inode])
        if already_archived:
            # A hardlink to an already archived file.
            member_type = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # The inode is added only if its valid.
            # For win32 it is always 0.
            member_type = REGTYPE
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(file_mode):
        member_type = DIRTYPE
    elif stat.S_ISFIFO(file_mode):
        member_type = FIFOTYPE
    elif stat.S_ISLNK(file_mode):
        member_type = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(file_mode):
        member_type = CHRTYPE
    elif stat.S_ISBLK(file_mode):
        member_type = BLKTYPE
    else:
        return None

    # Fill the TarInfo object with all
    # information we can get.
    tarinfo.name = arcname
    tarinfo.mode = file_mode
    tarinfo.uid = st.st_uid
    tarinfo.gid = st.st_gid
    # Only regular files carry data; every other type has size 0.
    tarinfo.size = st.st_size if member_type == REGTYPE else 0
    tarinfo.mtime = st.st_mtime
    tarinfo.type = member_type
    tarinfo.linkname = linkname
    if pwd:
        try:
            tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
        except KeyError:
            pass

    is_device = member_type in (CHRTYPE, BLKTYPE)
    if is_device and hasattr(os, "major") and hasattr(os, "minor"):
        tarinfo.devmajor = os.major(st.st_rdev)
        tarinfo.devminor = os.minor(st.st_rdev)
    return tarinfo
1855
def list(self, verbose=True, *, members=None):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced. `members' is optional and must be a subset of the
       list returned by getmembers(); by default the archive itself is
       iterated.
    """
    self._check()

    entries = self if members is None else members
    for tarinfo in entries:
        if verbose:
            _safe_print(stat.filemode(tarinfo.mode))
            owner = tarinfo.uname or tarinfo.uid
            group = tarinfo.gname or tarinfo.gid
            _safe_print("%s/%s" % (owner, group))
            if tarinfo.ischr() or tarinfo.isblk():
                # Devices show "major,minor" in place of a size.
                device = "%d,%d" % (tarinfo.devmajor, tarinfo.devminor)
                _safe_print("%10s" % device)
            else:
                _safe_print("%10d" % tarinfo.size)
            timestamp = time.localtime(tarinfo.mtime)[:6]
            _safe_print("%d-%02d-%02d %02d:%02d:%02d" % timestamp)

        suffix = "/" if tarinfo.isdir() else ""
        _safe_print(tarinfo.name + suffix)

        if verbose:
            if tarinfo.issym():
                _safe_print("-> " + tarinfo.linkname)
            if tarinfo.islnk():
                _safe_print("link to " + tarinfo.linkname)
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001887
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. When recursing, directory entries are
       added in sorted order so that the resulting archive does not depend
       on the order in which the operating system lists them.
       `exclude' is a deprecated function that should return True for each
       filename to be excluded; passing it emits a DeprecationWarning.
       `filter' is a function that expects a TarInfo object argument and
       returns the changed TarInfo object, if it returns None the TarInfo
       object will be excluded from the archive.
    """
    self._check("awx")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)

    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        with bltn_open(name, "rb") as f:
            self.addfile(tarinfo, f)

    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            # os.listdir() returns entries in arbitrary order; sort them
            # so that archiving the same tree always yields the same
            # member order.
            for f in sorted(os.listdir(name)):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                        recursive, exclude, filter=filter)

    else:
        self.addfile(tarinfo)
1948
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, tarinfo.size bytes are read from it and added to the archive,
       padded with zeros to a multiple of BLOCKSIZE.
       You can create TarInfo objects using gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.
    """
    self._check("awx")

    # Work on a copy so the caller's object is not the one we store.
    member = copy.copy(tarinfo)

    header = member.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset += len(header)

    # If there's data to follow, append it.
    if fileobj is not None:
        copyfileobj(fileobj, self.fileobj, member.size)
        full_blocks, tail = divmod(member.size, BLOCKSIZE)
        if tail > 0:
            # Pad the last, partially filled block with zeros.
            self.fileobj.write(NUL * (BLOCKSIZE - tail))
            full_blocks += 1
        self.offset += full_blocks * BLOCKSIZE

    self.members.append(member)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001974
def extractall(self, path=".", members=None, *, numeric_owner=False):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers(). If `numeric_owner' is True, only
       the numbers for user/group names are used and not the names.
    """
    pending_dirs = []

    if members is None:
        members = self

    for tarinfo in members:
        if tarinfo.isdir():
            # Extract directories with a safe mode; the real attributes
            # of the original object are applied further down.
            pending_dirs.append(tarinfo)
            tarinfo = copy.copy(tarinfo)
            tarinfo.mode = 0o700
        # Do not set_attrs directories, as we will do that further down
        apply_attrs = not tarinfo.isdir()
        self.extract(tarinfo, path, set_attrs=apply_attrs,
                     numeric_owner=numeric_owner)

    # Visit directories in reverse name order.
    name_ordered = sorted(pending_dirs, key=lambda entry: entry.name)

    # Set correct owner, mtime and filemode on directories.
    for tarinfo in reversed(name_ordered):
        dirpath = os.path.join(path, tarinfo.name)
        try:
            self.chown(tarinfo, dirpath, numeric_owner=numeric_owner)
            self.utime(tarinfo, dirpath)
            self.chmod(tarinfo, dirpath)
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % e)
2014
def extract(self, member, path="", set_attrs=True, *, numeric_owner=False):
    """Extract a member from the archive to the current working directory,
       using its full name. Its file information is extracted as accurately
       as possible. `member' may be a filename or a TarInfo object. You can
       specify a different directory using `path'. File attributes (owner,
       mtime, mode) are set unless `set_attrs' is False. If `numeric_owner'
       is True, only the numbers for user/group names are used and not
       the names.

       An OSError is re-raised when errorlevel > 0 and an ExtractError
       when errorlevel > 1; otherwise the error is only reported through
       the debug output.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Prepare the link target for makelink().
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    targetpath = os.path.join(path, tarinfo.name)
    try:
        self._extract_member(tarinfo, targetpath,
                             set_attrs=set_attrs,
                             numeric_owner=numeric_owner)
    except OSError as e:
        if self.errorlevel > 0:
            raise
        if e.filename is None:
            self._dbg(1, "tarfile: %s" % e.strerror)
        else:
            self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
    except ExtractError as e:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % e)
2052
def extractfile(self, member):
    """Return a file object for reading the data of `member', which may
       be a member name or a TarInfo object. Regular files and members
       of an unknown type yield the object built by self.fileobject();
       a hard or symbolic link yields the file object of the member it
       points to. Members that carry no data (directories, devices,
       etc.) yield None.

       Raises StreamError when a link is requested while the archive is
       read from a _Stream.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        # Members with unknown types are treated as regular files.
        return self.fileobject(self, tarinfo)

    if not (tarinfo.islnk() or tarinfo.issym()):
        # No data is associated with this kind of member.
        return None

    if isinstance(self.fileobj, _Stream):
        # The link target cannot be looked up in a stream of tar
        # blocks that does not support seeking.
        raise StreamError("cannot extract (sym)link as file object")

    # A (sym)link's file object is its target's file object.
    return self.extractfile(self._find_link_target(tarinfo))
2083
Eric V. Smith7a803892015-04-15 10:27:58 -04002084 def _extract_member(self, tarinfo, targetpath, set_attrs=True,
2085 numeric_owner=False):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002086 """Extract the TarInfo object tarinfo to a physical
2087 file called targetpath.
2088 """
2089 # Fetch the TarInfo object for the given name
2090 # and build the destination pathname, replacing
2091 # forward slashes to platform specific separators.
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +00002092 targetpath = targetpath.rstrip("/")
2093 targetpath = targetpath.replace("/", os.sep)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002094
2095 # Create all upper directories.
2096 upperdirs = os.path.dirname(targetpath)
2097 if upperdirs and not os.path.exists(upperdirs):
Christian Heimes2202f872008-02-06 14:31:34 +00002098 # Create directories that are not part of the archive with
2099 # default permissions.
Thomas Woutersb2137042007-02-01 18:02:27 +00002100 os.makedirs(upperdirs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002101
2102 if tarinfo.islnk() or tarinfo.issym():
2103 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2104 else:
2105 self._dbg(1, tarinfo.name)
2106
2107 if tarinfo.isreg():
2108 self.makefile(tarinfo, targetpath)
2109 elif tarinfo.isdir():
2110 self.makedir(tarinfo, targetpath)
2111 elif tarinfo.isfifo():
2112 self.makefifo(tarinfo, targetpath)
2113 elif tarinfo.ischr() or tarinfo.isblk():
2114 self.makedev(tarinfo, targetpath)
2115 elif tarinfo.islnk() or tarinfo.issym():
2116 self.makelink(tarinfo, targetpath)
2117 elif tarinfo.type not in SUPPORTED_TYPES:
2118 self.makeunknown(tarinfo, targetpath)
2119 else:
2120 self.makefile(tarinfo, targetpath)
2121
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002122 if set_attrs:
Eric V. Smith7a803892015-04-15 10:27:58 -04002123 self.chown(tarinfo, targetpath, numeric_owner)
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002124 if not tarinfo.issym():
2125 self.chmod(tarinfo, targetpath)
2126 self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002127
2128 #--------------------------------------------------------------------------
2129 # Below are the different file methods. They are called via
2130 # _extract_member() when extract() is called. They can be replaced in a
2131 # subclass to implement other functionality.
2132
def makedir(self, tarinfo, targetpath):
    """Create the directory targetpath; an already existing path is
       left untouched.
    """
    # The directory starts out with a safe, owner-only mode.
    safe_mode = 0o700
    try:
        os.mkdir(targetpath, safe_mode)
    except FileExistsError:
        return
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002142
def makefile(self, tarinfo, targetpath):
    """Write the data of tarinfo to a new file called targetpath.
    """
    archive = self.fileobj
    archive.seek(tarinfo.offset_data)
    with bltn_open(targetpath, "wb") as out:
        if tarinfo.sparse is None:
            copyfileobj(archive, out, tarinfo.size)
        else:
            # Each (offset, size) pair names a stored data segment:
            # position the output at the offset and copy that many
            # bytes from the archive.
            for offset, size in tarinfo.sparse:
                out.seek(offset)
                copyfileobj(archive, out, size)
        # Make the file end exactly at tarinfo.size.
        out.seek(tarinfo.size)
        out.truncate()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002157
def makeunknown(self, tarinfo, targetpath):
    """Extract a member of unknown type to targetpath as if it were a
       regular file and report that through _dbg().
    """
    self.makefile(tarinfo, targetpath)
    note = "tarfile: Unknown file type %r, extracted as regular file."
    self._dbg(1, note % tarinfo.type)
2165
def makefifo(self, tarinfo, targetpath):
    """Create a fifo at targetpath. Raises ExtractError when the os
       module offers no mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002173
def makedev(self, tarinfo, targetpath):
    """Create the character or block device node described by tarinfo
       at targetpath. Raises ExtractError when the os module lacks
       mknod() or makedev().
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the permission bits with the file type flag of the node.
    node_type = stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR
    mode = tarinfo.mode | node_type

    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, mode, device)
2188
def makelink(self, tarinfo, targetpath):
    """Create the symbolic or hard link described by tarinfo at
       targetpath.

       A symbolic link is created with os.symlink(). Anything that
       already exists at targetpath (including a dangling link) is
       removed first, so that extracting over an earlier extraction
       does not fail with FileExistsError. A hard link is created with
       os.link() if its target (tarinfo._link_target, prepared by
       extract()) exists on disk; otherwise the member the link refers
       to is extracted to targetpath instead. If creating the link
       raises symlink_exception (platform limitation), a copy of the
       referenced member is extracted in place of the link.

       Raises ExtractError if that copy fallback cannot find the
       referenced member inside the archive.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            if os.path.lexists(targetpath):
                # os.symlink() refuses to replace an existing entry.
                os.unlink(targetpath)
            os.symlink(tarinfo.linkname, targetpath)
        else:
            # See extract().
            if os.path.exists(tarinfo._link_target):
                os.link(tarinfo._link_target, targetpath)
            else:
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
    except symlink_exception:
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            # The KeyError is an implementation detail of the lookup;
            # report only the extraction failure.
            raise ExtractError(
                "unable to resolve link inside archive") from None
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002211
def chown(self, tarinfo, targetpath, numeric_owner):
    """Change the owner of targetpath to the one recorded in tarinfo.
       With numeric_owner true, tarinfo.uid/.gid are used directly;
       otherwise tarinfo.uname/.gname are looked up first and the
       numeric ids serve as fallback for unknown names. Nothing is done
       unless the process runs with effective uid 0.

       Raises ExtractError if changing the owner fails.
    """
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        # We have to be root to change ownership.
        return

    if numeric_owner:
        g, u = tarinfo.gid, tarinfo.uid
    else:
        try:
            g = grp.getgrnam(tarinfo.gname)[2]
        except KeyError:
            g = tarinfo.gid
        try:
            u = pwd.getpwnam(tarinfo.uname)[2]
        except KeyError:
            u = tarinfo.uid

    try:
        # Change the link itself, not its target, where possible.
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, u, g)
        else:
            os.chown(targetpath, u, g)
    except OSError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002237
def chmod(self, tarinfo, targetpath):
    """Apply tarinfo.mode to targetpath. Does nothing when the os
       module has no chmod(); raises ExtractError if the call fails.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except OSError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002246
def utime(self, tarinfo, targetpath):
    """Set both the access and the modification time of targetpath to
       tarinfo.mtime. Does nothing when the os module has no utime();
       raises ExtractError if the call fails.
    """
    if hasattr(os, 'utime'):
        try:
            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
        except OSError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002256
2257 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, when
       TarFile is opened for reading. Return None if there is no more
       available.

       Every member read here is appended to self.members; once no
       further member can be read, self._loaded is set. Raises
       ReadError for a bad header at offset 0 and for any
       SubsequentHeaderError.
    """
    self._check("ra")

    # A member that was already read ahead is handed out first.
    pending = self.firstmember
    if pending is not None:
        self.firstmember = None
        return pending

    def skip_block(err):
        # Report the unusable block and step over it.
        self._dbg(2, "0x%X: %s" % (self.offset, err))
        self.offset += BLOCKSIZE

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as e:
            if self.ignore_zeros:
                skip_block(e)
                continue
        except InvalidHeaderError as e:
            if self.ignore_zeros:
                skip_block(e)
                continue
            if self.offset == 0:
                raise ReadError(str(e))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as e:
            if self.offset == 0:
                raise ReadError(str(e))
        except SubsequentHeaderError as e:
            raise ReadError(str(e))
        break

    if tarinfo is None:
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002303
2304 #--------------------------------------------------------------------------
2305 # Little helper methods:
2306
Lars Gustäbel1b512722010-06-03 12:45:16 +00002307 def _getmember(self, name, tarinfo=None, normalize=False):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002308 """Find an archive member by name from bottom to top.
2309 If tarinfo is given, it is used as the starting point.
2310 """
Martin v. Löwisf3c56112004-09-18 09:08:52 +00002311 # Ensure that all members have been loaded.
2312 members = self.getmembers()
2313
Lars Gustäbel1b512722010-06-03 12:45:16 +00002314 # Limit the member search list up to tarinfo.
2315 if tarinfo is not None:
2316 members = members[:members.index(tarinfo)]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002317
Lars Gustäbel1b512722010-06-03 12:45:16 +00002318 if normalize:
2319 name = os.path.normpath(name)
2320
2321 for member in reversed(members):
2322 if normalize:
2323 member_name = os.path.normpath(member.name)
2324 else:
2325 member_name = member.name
2326
2327 if name == member_name:
2328 return member
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002329
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002330 def _load(self):
2331 """Read through the entire archive file and look for readable
2332 members.
2333 """
2334 while True:
2335 tarinfo = self.next()
2336 if tarinfo is None:
2337 break
2338 self._loaded = True
2339
2340 def _check(self, mode=None):
2341 """Check if TarFile is still open, and if the operation's mode
2342 corresponds to TarFile's mode.
2343 """
2344 if self.closed:
Andrew Svetlovf7a17b42012-12-25 16:47:37 +02002345 raise OSError("%s is closed" % self.__class__.__name__)
Guido van Rossumd8faa362007-04-27 19:54:29 +00002346 if mode is not None and self.mode not in mode:
Andrew Svetlovf7a17b42012-12-25 16:47:37 +02002347 raise OSError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002348
Lars Gustäbel1b512722010-06-03 12:45:16 +00002349 def _find_link_target(self, tarinfo):
2350 """Find the target member of a symlink or hardlink member in the
2351 archive.
2352 """
2353 if tarinfo.issym():
2354 # Always search the entire archive.
Lars Gustäbel1ef9eda2012-04-24 21:04:40 +02002355 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
Lars Gustäbel1b512722010-06-03 12:45:16 +00002356 limit = None
2357 else:
2358 # Search the archive before the link, because a hard link is
2359 # just a reference to an already archived file.
2360 linkname = tarinfo.linkname
2361 limit = tarinfo
2362
2363 member = self._getmember(linkname, tarinfo=limit, normalize=True)
2364 if member is None:
2365 raise KeyError("linkname %r not found" % linkname)
2366 return member
2367
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002368 def __iter__(self):
2369 """Provide an iterator object.
2370 """
2371 if self._loaded:
2372 return iter(self.members)
2373 else:
2374 return TarIter(self)
2375
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002376 def _dbg(self, level, msg):
2377 """Write debugging output to sys.stderr.
2378 """
2379 if level <= self.debug:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00002380 print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002381
2382 def __enter__(self):
2383 self._check()
2384 return self
2385
2386 def __exit__(self, type, value, traceback):
2387 if type is None:
2388 self.close()
2389 else:
2390 # An exception occurred. We must not call close() because
2391 # it would try to write end-of-archive blocks and padding.
2392 if not self._extfileobj:
2393 self.fileobj.close()
2394 self.closed = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002395# class TarFile
2396
class TarIter:
    """Iterator over the members of a TarFile.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Start iterating over the members of tarfile.
        """
        self.tarfile = tarfile
        # Position of the next member to hand out.
        self.index = 0

    def __iter__(self):
        """Return the iterator itself.
        """
        return self

    def __next__(self):
        """Return the next member, reading it through TarFile's next()
           method where necessary. When all members have been read, set
           TarFile as _loaded.
        """
        # The position is tracked here instead of relying on next()
        # alone (fix for SF #1100429): under rare circumstances
        # getmembers() is called during iteration, which would
        # otherwise cause TarIter to stop prematurely.
        archive = self.tarfile
        position = self.index

        if position == 0 and archive.firstmember is not None:
            tarinfo = archive.next()
        elif position < len(archive.members):
            tarinfo = archive.members[position]
        elif archive._loaded:
            raise StopIteration
        else:
            tarinfo = archive.next()
            if not tarinfo:
                archive._loaded = True
                raise StopIteration

        self.index += 1
        return tarinfo
2434
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002435#--------------------
2436# exported functions
2437#--------------------
def is_tarfile(name):
    """Return True if the archive called name can be opened (and closed
       again) by this module without a TarError, else return False.
    """
    try:
        open(name).close()
    except TarError:
        return False
    return True
2447 return False
2448
# Expose TarFile.open as the module-level open(). From here on the name
# `open' in this module refers to TarFile.open, not to the builtin.
open = TarFile.open
Serhiy Storchakad27b4552013-11-24 01:53:29 +02002450
2451
def main():
    """Command line interface: test, list, extract or create a tar
       archive, depending on the parsed command line options. Exits
       with status 1 if no action is given or the file is not a tar
       archive.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='A simple command line interface for tarfile module.')
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Verbose output')
    # Exactly one of the following actions may be requested.
    actions = parser.add_mutually_exclusive_group()
    actions.add_argument('-l', '--list', metavar='<tarfile>',
                         help='Show listing of a tarfile')
    actions.add_argument('-e', '--extract', nargs='+',
                         metavar=('<tarfile>', '<output_dir>'),
                         help='Extract tarfile into target dir')
    actions.add_argument('-c', '--create', nargs='+',
                         metavar=('<name>', '<file>'),
                         help='Create tarfile from sources')
    actions.add_argument('-t', '--test', metavar='<tarfile>',
                         help='Test if a tarfile is valid')
    args = parser.parse_args()

    def reject(src):
        # Report that src cannot be handled and exit with status 1.
        parser.exit(1, '{!r} is not a tar archive.\n'.format(src))

    if args.test:
        src = args.test
        if is_tarfile(src):
            with open(src, 'r') as tar:
                tar.getmembers()
                print(tar.getmembers(), file=sys.stderr)
            if args.verbose:
                print('{!r} is a tar archive.'.format(src))
        else:
            reject(src)

    elif args.list:
        src = args.list
        if is_tarfile(src):
            with TarFile.open(src, 'r:*') as tf:
                tf.list(verbose=args.verbose)
        else:
            reject(src)

    elif args.extract:
        # One argument: archive only; two: archive and output directory.
        given = len(args.extract)
        if given == 1:
            src, curdir = args.extract[0], os.curdir
        elif given == 2:
            src, curdir = args.extract
        else:
            parser.exit(1, parser.format_help())

        if is_tarfile(src):
            with TarFile.open(src, 'r:*') as tf:
                tf.extractall(path=curdir)
            if args.verbose:
                if curdir == '.':
                    msg = '{!r} file is extracted.'.format(src)
                else:
                    msg = ('{!r} file is extracted '
                           'into {!r} directory.').format(src, curdir)
                print(msg)
        else:
            reject(src)

    elif args.create:
        # The first argument names the archive, the rest are its sources.
        tar_name, *tar_files = args.create
        ext = os.path.splitext(tar_name)[1]
        compressions = {
            # gz
            '.gz': 'gz',
            '.tgz': 'gz',
            # xz
            '.xz': 'xz',
            '.txz': 'xz',
            # bz2
            '.bz2': 'bz2',
            '.tbz': 'bz2',
            '.tbz2': 'bz2',
            '.tb2': 'bz2',
        }
        # Unknown extensions produce an uncompressed archive.
        compression = compressions.get(ext)
        tar_mode = 'w' if compression is None else 'w:' + compression

        with TarFile.open(tar_name, tar_mode) as tf:
            for file_name in tar_files:
                tf.add(file_name)

        if args.verbose:
            print('{!r} file created.'.format(tar_name))

    else:
        parser.exit(1, parser.format_help())
2541
# Run the command line interface when the module is executed as a script.
if __name__ == '__main__':
    main()