blob: 960c673067e940330f5605b01e0188bb37aeba2e [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
# Module metadata.
version = "0.9.0"
__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000037
38#---------
39# Imports
40#---------
Serhiy Storchakacf4a2f22015-03-11 17:18:03 +020041from builtins import open as bltn_open
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000042import sys
43import os
Eli Bendersky74c503b2012-01-03 06:26:13 +020044import io
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000045import shutil
46import stat
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000047import time
48import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000049import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000050import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000051
52try:
53 import grp, pwd
Brett Cannoncd171c82013-07-04 17:43:24 -040054except ImportError:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000055 grp = pwd = None
56
# Exceptions that signal "a symbolic link could not be created here".
#   AttributeError / NotImplementedError: os.symlink is missing or
#       unsupported (os.symlink on Windows prior to 6.0 raises
#       NotImplementedError).
#   OSError (winerror=1314): raised if the caller does not hold the
#       SeCreateSymbolicLinkPrivilege privilege.
# OSError is a builtin and therefore always defined, so no NameError guard
# is needed when adding it to the tuple.
symlink_exception = (AttributeError, NotImplementedError, OSError)
65
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000066# from tarfile import *
# Public names exported by "from tarfile import *".
__all__ = [
    "TarFile",
    "TarInfo",
    "is_tarfile",
    "TarError",
    "ReadError",
    "CompressionError",
    "StreamError",
    "ExtractError",
    "HeaderError",
    "ENCODING",
    "USTAR_FORMAT",
    "GNU_FORMAT",
    "PAX_FORMAT",
    "DEFAULT_FORMAT",
    "open",
]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000071
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
# Both magic strings are 8 bytes long: GNU tar writes "ustar", two spaces
# and a NUL; POSIX writes "ustar", a NUL and the version digits "00".
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT

#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# Default encoding for member names: always UTF-8 on Windows, otherwise
# whatever the interpreter reports as the filesystem encoding.
if os.name == "nt":
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000151
152#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000153# Some useful functions
154#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000155
def stn(s, length, encoding, errors):
    """Encode the string s and fit it into a field of length bytes.

    The encoded value is cut off after length bytes if it is too long
    and padded on the right with NUL bytes if it is too short.
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, NUL)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000161
def nts(s, encoding, errors):
    """Decode a bytes field to a string, stopping at the first NUL byte.

    Everything from the first NUL byte onwards is discarded; if there is
    no NUL byte the whole field is decoded.
    """
    head, _sep, _rest = s.partition(b"\0")
    return head.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000169
def nti(s):
    """Convert a number field to a python number.

    Two encodings are understood (see itn()): a string of octal digits,
    or a marker byte (0o200 for positive, 0o377 for negative) followed by
    a big-endian base-256 value.  Raises InvalidHeaderError if an octal
    field cannot be parsed.
    """
    marker = s[0]
    if marker not in (0o200, 0o377):
        # Plain octal digits, possibly NUL-terminated and space-padded.
        try:
            text = nts(s, "ascii", "strict")
            return int(text.strip() or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")

    # Base-256 encoding: every byte after the marker is one digit.
    payload = s[1:]
    value = int.from_bytes(payload, "big")
    if marker == 0o377:
        value = -(256 ** len(payload) - value)
    return value
189
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.

    n is truncated to an integer first, so float values (e.g. an mtime)
    are accepted by both encodings.  Returns the field as a bytes-like
    object of length digits.  Raises ValueError if the number does not
    fit into the field for the given format.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    n = int(n)
    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        marker = 0o200 if n >= 0 else 0o377
        # Masking keeps the low digits-1 bytes; for a negative n this is
        # its two's complement representation.
        low_bytes = n & (256 ** (digits - 1) - 1)
        s = bytearray([marker]) + low_bytes.to_bytes(digits - 1, "big")
    else:
        raise ValueError("overflow in number field")

    return s
217
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a member's header.

    All bytes of the header are summed up except for the 8-byte chksum
    field, which is counted as if it was filled with spaces.  According
    to the GNU tar sources, some tars (Sun and NeXT) calculate chksum
    with signed char, which gives a different result if bytes with the
    high bit set are present, hence both variants are returned.
    """
    # Eight spaces (0x20 each) standing in for the skipped chksum field.
    chksum_field_value = 256
    as_unsigned = struct.unpack_from("148B8x356B", buf)
    as_signed = struct.unpack_from("148b8x356b", buf)
    return (chksum_field_value + sum(as_unsigned),
            chksum_field_value + sum(as_signed))
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000230
def copyfileobj(src, dst, length=None, exception=OSError):
    """Copy length bytes from fileobj src to fileobj dst.

    With length set to None the entire remaining content of src is
    copied.  Otherwise exactly length bytes are transferred in chunks
    and exception("unexpected end of data") is raised as soon as src
    delivers fewer bytes than requested.
    """
    if length is None:
        shutil.copyfileobj(src, dst)
        return
    if length == 0:
        return

    chunk_size = 16 * 1024
    full_chunks, tail = divmod(length, chunk_size)

    def transfer(nbytes):
        # Move one chunk, insisting that it is complete.
        data = src.read(nbytes)
        if len(data) < nbytes:
            raise exception("unexpected end of data")
        dst.write(data)

    for _ in range(full_chunks):
        transfer(chunk_size)
    if tail:
        transfer(tail)
255
def filemode(mode):
    """Deprecated in this location; use stat.filemode."""
    from warnings import warn
    # stacklevel=2 attributes the warning to the caller of filemode().
    warn("deprecated in favor of stat.filemode",
         DeprecationWarning, stacklevel=2)
    return stat.filemode(mode)
262
Serhiy Storchaka3b4f1592014-02-05 20:53:36 +0200263def _safe_print(s):
264 encoding = getattr(sys.stdout, 'encoding', None)
265 if encoding is not None:
266 s = s.encode(encoding, 'backslashreplace').decode(encoding)
267 print(s, end=' ')
268
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000269
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000303
304#---------------------------
305# internal stream interface
306#---------------------------
307class _LowLevelFile:
308 """Low-level file object. Supports reading and writing.
309 It is used instead of a regular file object for streaming
310 access.
311 """
312
313 def __init__(self, name, mode):
314 mode = {
315 "r": os.O_RDONLY,
316 "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
317 }[mode]
318 if hasattr(os, "O_BINARY"):
319 mode |= os.O_BINARY
Lars Gustäbeld6eb70b2010-04-29 15:37:02 +0000320 self.fd = os.open(name, mode, 0o666)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000321
322 def close(self):
323 os.close(self.fd)
324
325 def read(self, size):
326 return os.read(self.fd, size)
327
328 def write(self, s):
329 os.write(self.fd, s)
330
331class _Stream:
332 """Class that serves as an adapter between TarFile and
333 a stream-like object. The stream-like object only
334 needs to have a read() or write() method and is accessed
335 blockwise. Use of gzip or bzip2 compression is possible.
336 A stream-like object could be for example: sys.stdin,
337 sys.stdout, a socket, a tape device etc.
338
339 _Stream is intended to be used only internally.
340 """
341
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000342 def __init__(self, name, mode, comptype, fileobj, bufsize):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000343 """Construct a _Stream object.
344 """
345 self._extfileobj = True
346 if fileobj is None:
347 fileobj = _LowLevelFile(name, mode)
348 self._extfileobj = False
349
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000350 if comptype == '*':
351 # Enable transparent compression detection for the
352 # stream interface
353 fileobj = _StreamProxy(fileobj)
354 comptype = fileobj.getcomptype()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000355
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000356 self.name = name or ""
357 self.mode = mode
358 self.comptype = comptype
359 self.fileobj = fileobj
360 self.bufsize = bufsize
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000361 self.buf = b""
Guido van Rossume2a383d2007-01-15 16:59:06 +0000362 self.pos = 0
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000363 self.closed = False
364
Antoine Pitrou605c2932010-09-23 20:15:14 +0000365 try:
366 if comptype == "gz":
367 try:
368 import zlib
Brett Cannoncd171c82013-07-04 17:43:24 -0400369 except ImportError:
Antoine Pitrou605c2932010-09-23 20:15:14 +0000370 raise CompressionError("zlib module is not available")
371 self.zlib = zlib
372 self.crc = zlib.crc32(b"")
373 if mode == "r":
374 self._init_read_gz()
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100375 self.exception = zlib.error
Antoine Pitrou605c2932010-09-23 20:15:14 +0000376 else:
377 self._init_write_gz()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000378
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100379 elif comptype == "bz2":
Antoine Pitrou605c2932010-09-23 20:15:14 +0000380 try:
381 import bz2
Brett Cannoncd171c82013-07-04 17:43:24 -0400382 except ImportError:
Antoine Pitrou605c2932010-09-23 20:15:14 +0000383 raise CompressionError("bz2 module is not available")
384 if mode == "r":
385 self.dbuf = b""
386 self.cmp = bz2.BZ2Decompressor()
Andrew Svetlovf7a17b42012-12-25 16:47:37 +0200387 self.exception = OSError
Antoine Pitrou605c2932010-09-23 20:15:14 +0000388 else:
389 self.cmp = bz2.BZ2Compressor()
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100390
391 elif comptype == "xz":
392 try:
393 import lzma
Brett Cannoncd171c82013-07-04 17:43:24 -0400394 except ImportError:
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100395 raise CompressionError("lzma module is not available")
396 if mode == "r":
397 self.dbuf = b""
398 self.cmp = lzma.LZMADecompressor()
399 self.exception = lzma.LZMAError
400 else:
401 self.cmp = lzma.LZMACompressor()
402
403 elif comptype != "tar":
404 raise CompressionError("unknown compression type %r" % comptype)
405
Antoine Pitrou605c2932010-09-23 20:15:14 +0000406 except:
407 if not self._extfileobj:
408 self.fileobj.close()
409 self.closed = True
410 raise
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000411
412 def __del__(self):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000413 if hasattr(self, "closed") and not self.closed:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000414 self.close()
415
416 def _init_write_gz(self):
417 """Initialize for writing with gzip compression.
418 """
419 self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
420 -self.zlib.MAX_WBITS,
421 self.zlib.DEF_MEM_LEVEL,
422 0)
Guido van Rossume2a383d2007-01-15 16:59:06 +0000423 timestamp = struct.pack("<L", int(time.time()))
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000424 self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000425 if self.name.endswith(".gz"):
426 self.name = self.name[:-3]
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000427 # RFC1952 says we must use ISO-8859-1 for the FNAME field.
428 self.__write(self.name.encode("iso-8859-1", "replace") + NUL)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000429
430 def write(self, s):
431 """Write string s to the stream.
432 """
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000433 if self.comptype == "gz":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000434 self.crc = self.zlib.crc32(s, self.crc)
435 self.pos += len(s)
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000436 if self.comptype != "tar":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000437 s = self.cmp.compress(s)
438 self.__write(s)
439
440 def __write(self, s):
441 """Write string s to the stream if a whole new block
442 is ready to be written.
443 """
444 self.buf += s
445 while len(self.buf) > self.bufsize:
446 self.fileobj.write(self.buf[:self.bufsize])
447 self.buf = self.buf[self.bufsize:]
448
449 def close(self):
450 """Close the _Stream object. No operation should be
451 done on it afterwards.
452 """
453 if self.closed:
454 return
455
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000456 self.closed = True
Serhiy Storchaka7e7a3db2015-04-10 13:24:41 +0300457 try:
458 if self.mode == "w" and self.comptype != "tar":
459 self.buf += self.cmp.flush()
460
461 if self.mode == "w" and self.buf:
462 self.fileobj.write(self.buf)
463 self.buf = b""
464 if self.comptype == "gz":
Martin Panterb82032f2015-12-11 05:19:29 +0000465 self.fileobj.write(struct.pack("<L", self.crc))
Serhiy Storchaka7e7a3db2015-04-10 13:24:41 +0300466 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
467 finally:
468 if not self._extfileobj:
469 self.fileobj.close()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000470
471 def _init_read_gz(self):
472 """Initialize for reading a gzip compressed fileobj.
473 """
474 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000475 self.dbuf = b""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000476
477 # taken from gzip.GzipFile with some alterations
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000478 if self.__read(2) != b"\037\213":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000479 raise ReadError("not a gzip file")
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000480 if self.__read(1) != b"\010":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000481 raise CompressionError("unsupported compression method")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000482
483 flag = ord(self.__read(1))
484 self.__read(6)
485
486 if flag & 4:
487 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
488 self.read(xlen)
489 if flag & 8:
490 while True:
491 s = self.__read(1)
492 if not s or s == NUL:
493 break
494 if flag & 16:
495 while True:
496 s = self.__read(1)
497 if not s or s == NUL:
498 break
499 if flag & 2:
500 self.__read(2)
501
502 def tell(self):
503 """Return the stream's file pointer position.
504 """
505 return self.pos
506
507 def seek(self, pos=0):
508 """Set the stream's file pointer to pos. Negative seeking
509 is forbidden.
510 """
511 if pos - self.pos >= 0:
512 blocks, remainder = divmod(pos - self.pos, self.bufsize)
Guido van Rossum805365e2007-05-07 22:24:25 +0000513 for i in range(blocks):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000514 self.read(self.bufsize)
515 self.read(remainder)
516 else:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000517 raise StreamError("seeking backwards is not allowed")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000518 return self.pos
519
520 def read(self, size=None):
521 """Return the next size number of bytes from the stream.
522 If size is not defined, return all bytes of the stream
523 up to EOF.
524 """
525 if size is None:
526 t = []
527 while True:
528 buf = self._read(self.bufsize)
529 if not buf:
530 break
531 t.append(buf)
532 buf = "".join(t)
533 else:
534 buf = self._read(size)
535 self.pos += len(buf)
536 return buf
537
538 def _read(self, size):
539 """Return size bytes from the stream.
540 """
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000541 if self.comptype == "tar":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000542 return self.__read(size)
543
544 c = len(self.dbuf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000545 while c < size:
546 buf = self.__read(self.bufsize)
547 if not buf:
548 break
Guido van Rossumd8faa362007-04-27 19:54:29 +0000549 try:
550 buf = self.cmp.decompress(buf)
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100551 except self.exception:
Guido van Rossumd8faa362007-04-27 19:54:29 +0000552 raise ReadError("invalid compressed data")
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000553 self.dbuf += buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000554 c += len(buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000555 buf = self.dbuf[:size]
556 self.dbuf = self.dbuf[size:]
557 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000558
559 def __read(self, size):
560 """Return size bytes from stream. If internal buffer is empty,
561 read another block from the stream.
562 """
563 c = len(self.buf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000564 while c < size:
565 buf = self.fileobj.read(self.bufsize)
566 if not buf:
567 break
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000568 self.buf += buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000569 c += len(buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000570 buf = self.buf[:size]
571 self.buf = self.buf[size:]
572 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000573# class _Stream
574
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000575class _StreamProxy(object):
576 """Small proxy class that enables transparent compression
577 detection for the Stream interface (mode 'r|*').
578 """
579
580 def __init__(self, fileobj):
581 self.fileobj = fileobj
582 self.buf = self.fileobj.read(BLOCKSIZE)
583
584 def read(self, size):
585 self.read = self.fileobj.read
586 return self.buf
587
588 def getcomptype(self):
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100589 if self.buf.startswith(b"\x1f\x8b\x08"):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000590 return "gz"
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100591 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000592 return "bz2"
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100593 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
594 return "xz"
595 else:
596 return "tar"
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000597
598 def close(self):
599 self.fileobj.close()
600# class StreamProxy
601
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000602#------------------------
603# Extraction file object
604#------------------------
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000605class _FileInFile(object):
606 """A thin wrapper around an existing file object that
607 provides a part of its data as an individual file
608 object.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000609 """
610
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000611 def __init__(self, fileobj, offset, size, blockinfo=None):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000612 self.fileobj = fileobj
613 self.offset = offset
614 self.size = size
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000615 self.position = 0
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200616 self.name = getattr(fileobj, "name", None)
617 self.closed = False
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000618
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000619 if blockinfo is None:
620 blockinfo = [(0, size)]
621
622 # Construct a map with data and zero blocks.
623 self.map_index = 0
624 self.map = []
625 lastpos = 0
626 realpos = self.offset
627 for offset, size in blockinfo:
628 if offset > lastpos:
629 self.map.append((False, lastpos, offset, None))
630 self.map.append((True, offset, offset + size, realpos))
631 realpos += size
632 lastpos = offset + size
633 if lastpos < self.size:
634 self.map.append((False, lastpos, self.size, None))
635
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200636 def flush(self):
637 pass
638
639 def readable(self):
640 return True
641
642 def writable(self):
643 return False
644
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000645 def seekable(self):
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000646 return self.fileobj.seekable()
647
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000648 def tell(self):
649 """Return the current file position.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000650 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000651 return self.position
652
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200653 def seek(self, position, whence=io.SEEK_SET):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000654 """Seek to a position in the file.
655 """
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200656 if whence == io.SEEK_SET:
657 self.position = min(max(position, 0), self.size)
658 elif whence == io.SEEK_CUR:
659 if position < 0:
660 self.position = max(self.position + position, 0)
661 else:
662 self.position = min(self.position + position, self.size)
663 elif whence == io.SEEK_END:
664 self.position = max(min(self.size + position, self.size), 0)
665 else:
666 raise ValueError("Invalid argument")
667 return self.position
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000668
669 def read(self, size=None):
670 """Read data from the file.
671 """
672 if size is None:
673 size = self.size - self.position
674 else:
675 size = min(size, self.size - self.position)
676
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000677 buf = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000678 while size > 0:
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000679 while True:
680 data, start, stop, offset = self.map[self.map_index]
681 if start <= self.position < stop:
682 break
683 else:
684 self.map_index += 1
685 if self.map_index == len(self.map):
686 self.map_index = 0
687 length = min(size, stop - self.position)
688 if data:
Lars Gustäbeldd071042011-02-23 11:42:22 +0000689 self.fileobj.seek(offset + (self.position - start))
Lars Gustäbel03572682015-07-06 09:27:24 +0200690 b = self.fileobj.read(length)
691 if len(b) != length:
692 raise ReadError("unexpected end of data")
693 buf += b
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000694 else:
695 buf += NUL * length
696 size -= length
697 self.position += length
698 return buf
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000699
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200700 def readinto(self, b):
701 buf = self.read(len(b))
702 b[:len(buf)] = buf
703 return len(buf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000704
705 def close(self):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000706 self.closed = True
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200707#class _FileInFile
Martin v. Löwisdf241532005-03-03 08:17:42 +0000708
class ExFileObject(io.BufferedReader):
    """Buffered reader over the data of a single archive member."""

    def __init__(self, tarfile, tarinfo):
        # The raw stream exposes only the member's data region of the
        # archive's file object, including sparse-block information.
        raw = _FileInFile(fileobj=tarfile.fileobj,
                          offset=tarinfo.offset_data,
                          size=tarinfo.size,
                          blockinfo=tarinfo.sparse)
        super().__init__(raw)
#class ExFileObject
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000716
717#------------------
718# Exported Classes
719#------------------
720class TarInfo(object):
721 """Informational class which holds the details about an
722 archive member given by a tar header block.
723 TarInfo objects are returned by TarFile.getmember(),
724 TarFile.getmembers() and TarFile.gettarinfo() and are
725 usually created internally.
726 """
727
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000728 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
729 "chksum", "type", "linkname", "uname", "gname",
730 "devmajor", "devminor",
731 "offset", "offset_data", "pax_headers", "sparse",
732 "tarfile", "_sparse_structs", "_link_target")
733
def __init__(self, name=""):
    """Construct a TarInfo object with default field values.

    name is the optional name of the member; every other header
    field starts out with the defaults assigned below.
    """
    # Position of the member inside the archive.
    self.offset = 0             # the tar header starts here
    self.offset_data = 0        # the file's data starts here

    # Extension data.
    self.sparse = None          # sparse member information
    self.pax_headers = {}       # pax header information

    # Naming and type.
    self.name = name            # member name
    self.linkname = ""          # link name
    self.type = REGTYPE         # member type

    # Ownership and permissions.
    self.mode = 0o644           # file permissions
    self.uid = 0                # user id
    self.gid = 0                # group id
    self.uname = ""             # user name
    self.gname = ""             # group name

    # Size, time stamp and checksum.
    self.size = 0               # file size
    self.mtime = 0              # modification time
    self.chksum = 0             # header checksum

    # Device numbers.
    self.devmajor = 0           # device major number
    self.devminor = 0           # device minor number
757
# In pax headers the "name" and "linkname" fields are called
# "path" and "linkpath". The two properties below expose those pax
# names as read/write aliases of the corresponding attributes.
def _getpath(self):
    """Return the member name."""
    return self.name
def _setpath(self, name):
    """Set the member name."""
    self.name = name
path = property(_getpath, _setpath)

def _getlinkpath(self):
    """Return the link name."""
    return self.linkname
def _setlinkpath(self, linkname):
    """Set the link name."""
    self.linkname = linkname
linkpath = property(_getlinkpath, _setlinkpath)
771
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000772 def __repr__(self):
773 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
774
def get_info(self):
    """Return the TarInfo's attributes as a dictionary.

    The mode is masked with 0o7777, and the name of a DIRTYPE
    member gets a trailing slash if it does not have one yet.
    """
    name = self.name
    mode = self.mode & 0o7777
    uid = self.uid
    gid = self.gid
    size = self.size
    mtime = self.mtime
    chksum = self.chksum
    member_type = self.type
    linkname = self.linkname
    uname = self.uname
    gname = self.gname
    devmajor = self.devmajor
    devminor = self.devminor

    if member_type == DIRTYPE and not name.endswith("/"):
        name += "/"

    return {
        "name": name,
        "mode": mode,
        "uid": uid,
        "gid": gid,
        "size": size,
        "mtime": mtime,
        "chksum": chksum,
        "type": member_type,
        "linkname": linkname,
        "uname": uname,
        "gname": gname,
        "devmajor": devmajor,
        "devminor": devminor,
    }
798
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
    """Return the tar header for this member in the given format.

    format selects the ustar, GNU or pax header builder; any other
    value raises ValueError. The pax builder does not take errors.
    """
    info = self.get_info()

    if format == USTAR_FORMAT:
        return self.create_ustar_header(info, encoding, errors)
    if format == GNU_FORMAT:
        return self.create_gnu_header(info, encoding, errors)
    if format == PAX_FORMAT:
        return self.create_pax_header(info, encoding)
    raise ValueError("invalid format")
812
def create_ustar_header(self, info, encoding, errors):
    """Return the object as a ustar header block.

    Raises ValueError if the encoded linkname exceeds LENGTH_LINK.
    An encoded name longer than LENGTH_NAME is split into a
    prefix and a name part.
    """
    info["magic"] = POSIX_MAGIC

    encoded_link = info["linkname"].encode(encoding, errors)
    if len(encoded_link) > LENGTH_LINK:
        raise ValueError("linkname is too long")

    encoded_name = info["name"].encode(encoding, errors)
    if len(encoded_name) > LENGTH_NAME:
        prefix, short_name = self._posix_split_name(info["name"], encoding, errors)
        info["prefix"] = prefix
        info["name"] = short_name

    return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000825
def create_gnu_header(self, info, encoding, errors):
    """Return the object as a GNU header block sequence.

    A long-link and/or long-name header sequence is put in front of
    the regular header when the encoded linkname or name exceeds
    LENGTH_LINK or LENGTH_NAME respectively.
    """
    info["magic"] = GNU_MAGIC

    pieces = []
    # The linkname is handled first, then the name.
    long_fields = (
        ("linkname", LENGTH_LINK, GNUTYPE_LONGLINK),
        ("name", LENGTH_NAME, GNUTYPE_LONGNAME),
    )
    for key, limit, long_type in long_fields:
        if len(info[key].encode(encoding, errors)) > limit:
            pieces.append(
                self._create_gnu_long_header(info[key], long_type, encoding, errors))

    pieces.append(self._create_header(info, GNU_FORMAT, encoding, errors))
    return b"".join(pieces)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000839
def create_pax_header(self, info, encoding):
    """Return the object as a ustar header block. If it cannot be
       represented this way, prepend a pax extended header sequence
       with supplement information.
    """
    info["magic"] = POSIX_MAGIC
    pax_headers = self.pax_headers.copy()

    # String fields go into the pax header when they are not pure
    # ASCII or are longer than the ustar field.
    string_fields = (
        ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
        ("uname", "uname", 32), ("gname", "gname", 32))
    for name, hname, length in string_fields:
        if hname in pax_headers:
            # The pax header has priority.
            continue

        text = info[name]
        try:
            text.encode("ascii", "strict")
        except UnicodeEncodeError:
            pax_headers[hname] = text
        else:
            if len(text) > length:
                pax_headers[hname] = text

    # Number fields go into the pax header when they are out of range
    # for the octal ustar field or are floats.
    number_fields = (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12))
    for name, digits in number_fields:
        if name in pax_headers:
            # The pax header has priority. Avoid overflow.
            info[name] = 0
            continue

        val = info[name]
        in_range = 0 <= val < 8 ** (digits - 1)
        if not in_range or isinstance(val, float):
            pax_headers[name] = str(val)
            info[name] = 0

    # Create a pax extended header if necessary.
    extended = (self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
                if pax_headers else b"")

    return extended + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +0000888
@classmethod
def create_pax_global_header(cls, pax_headers):
    """Return a pax global header block sequence built from the
       pax_headers mapping, using UTF-8 as the encoding.
    """
    global_header = cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
    return global_header
Guido van Rossumd8faa362007-04-27 19:54:29 +0000894
def _posix_split_name(self, name, encoding, errors):
    """Split a name that does not fit the ustar name field into a
       (prefix, name) pair at a "/" boundary.

    The first split point whose encoded prefix fits LENGTH_PREFIX and
    whose encoded remainder fits LENGTH_NAME is used. Raises
    ValueError if no split point qualifies.
    """
    components = name.split("/")
    for split_at in range(1, len(components)):
        head = "/".join(components[:split_at])
        tail = "/".join(components[split_at:])
        head_fits = len(head.encode(encoding, errors)) <= LENGTH_PREFIX
        if head_fits and len(tail.encode(encoding, errors)) <= LENGTH_NAME:
            return head, tail

    raise ValueError("name is too long")
910
@staticmethod
def _create_header(info, format, encoding, errors):
    """Return a header block. info is a dictionary with file
       information, format must be one of the *_FORMAT constants.

    Missing keys in info fall back to the defaults given below. The
    fields are concatenated, padded to BLOCKSIZE, and the checksum
    of the padded block is then written into the checksum field.
    """
    parts = [
        stn(info.get("name", ""), 100, encoding, errors),
        itn(info.get("mode", 0) & 0o7777, 8, format),
        itn(info.get("uid", 0), 8, format),
        itn(info.get("gid", 0), 8, format),
        itn(info.get("size", 0), 12, format),
        itn(info.get("mtime", 0), 12, format),
        # Checksum field placeholder. It must be 8 bytes wide so that
        # the type byte lands at offset 156, which is where frombuf()
        # reads it (checksum at 148:156, type at 156:157). Spelled as a
        # repetition so the width cannot be lost to whitespace changes.
        8 * b" ",
        info.get("type", REGTYPE),
        stn(info.get("linkname", ""), 100, encoding, errors),
        info.get("magic", POSIX_MAGIC),
        stn(info.get("uname", ""), 32, encoding, errors),
        stn(info.get("gname", ""), 32, encoding, errors),
        itn(info.get("devmajor", 0), 8, format),
        itn(info.get("devminor", 0), 8, format),
        stn(info.get("prefix", ""), 155, encoding, errors)
    ]

    # struct.pack() with an "Ns" format pads the joined fields with
    # NUL bytes up to BLOCKSIZE.
    buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
    chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
    # Assuming BLOCKSIZE is 512, -364 is offset 148 and -357 is offset
    # 155: the first seven placeholder bytes are replaced by six octal
    # digits plus NUL, and the eighth (a space) is kept.
    buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
    return buf
938
@staticmethod
def _create_payload(payload):
    """Return payload padded with NUL up to the next multiple of
       BLOCKSIZE; a payload that already ends on a block border is
       returned unchanged.
    """
    overhang = len(payload) % BLOCKSIZE
    if overhang > 0:
        payload += (BLOCKSIZE - overhang) * NUL
    return payload
948
@classmethod
def _create_gnu_long_header(cls, name, type, encoding, errors):
    """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
       for name: a "././@LongLink" header followed by the encoded,
       NUL-terminated name as block-padded payload.
    """
    encoded = name.encode(encoding, errors) + NUL

    info = {
        "name": "././@LongLink",
        "type": type,
        "size": len(encoded),
        "magic": GNU_MAGIC,
    }

    # Extended header first, then the name blocks.
    header = cls._create_header(info, USTAR_FORMAT, encoding, errors)
    return header + cls._create_payload(encoded)
965
@classmethod
def _create_pax_generic_header(cls, pax_headers, type, encoding):
    """Return a POSIX.1-2008 extended or global header sequence
       that contains a list of keyword, value pairs. The values
       must be strings.
    """
    # If any value cannot be encoded as strict UTF-8 (it contains
    # surrogate characters), the whole header is written with
    # hdrcharset=BINARY, see _proc_pax() for more information.
    binary = False
    for keyword, value in pax_headers.items():
        try:
            value.encode("utf-8", "strict")
        except UnicodeEncodeError:
            binary = True
            break

    chunks = []
    if binary:
        # Put the hdrcharset field at the beginning of the header.
        chunks.append(b"21 hdrcharset=BINARY\n")

    # In binary mode try to restore the original byte representation
    # of each value. Needless to say, the encoding must match the
    # string.
    if binary:
        value_encoding, value_errors = encoding, "surrogateescape"
    else:
        value_encoding, value_errors = "utf-8", "strict"

    for keyword, value in pax_headers.items():
        keyword = keyword.encode("utf-8")
        value = value.encode(value_encoding, value_errors)

        # The record length counts its own decimal digits, so iterate
        # until the total stops changing.
        base = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
        total = 0
        while True:
            candidate = base + len(str(total))
            if candidate == total:
                break
            total = candidate
        chunks.append(bytes(str(total), "ascii") + b" " + keyword + b"=" + value + b"\n")

    records = b"".join(chunks)

    # We use a hardcoded "././@PaxHeader" name like star does
    # instead of the one that POSIX recommends.
    info = {
        "name": "././@PaxHeader",
        "type": type,
        "size": len(records),
        "magic": POSIX_MAGIC,
    }

    # Pax header first, then the record blocks.
    header = cls._create_header(info, USTAR_FORMAT, "ascii", "replace")
    return header + cls._create_payload(records)
1016
@classmethod
def frombuf(cls, buf, encoding, errors):
    """Construct a TarInfo object from a 512 byte bytes object.

    encoding and errors are passed to nts() for the string fields.

    Raises EmptyHeaderError for an empty buffer, TruncatedHeaderError
    if the buffer is not exactly BLOCKSIZE bytes long, EOFHeaderError
    for a block consisting only of NUL bytes and InvalidHeaderError
    if the stored checksum matches none of the computed ones.
    """
    if len(buf) == 0:
        raise EmptyHeaderError("empty header")
    if len(buf) != BLOCKSIZE:
        raise TruncatedHeaderError("truncated header")
    if buf.count(NUL) == BLOCKSIZE:
        raise EOFHeaderError("end of file header")

    # The stored checksum has to be one of the values that
    # calc_chksums() returns for this block.
    chksum = nti(buf[148:156])
    if chksum not in calc_chksums(buf):
        raise InvalidHeaderError("bad checksum")

    # Copy the fixed-offset header fields into a new object.
    obj = cls()
    obj.name = nts(buf[0:100], encoding, errors)
    obj.mode = nti(buf[100:108])
    obj.uid = nti(buf[108:116])
    obj.gid = nti(buf[116:124])
    obj.size = nti(buf[124:136])
    obj.mtime = nti(buf[136:148])
    obj.chksum = chksum
    obj.type = buf[156:157]
    obj.linkname = nts(buf[157:257], encoding, errors)
    obj.uname = nts(buf[265:297], encoding, errors)
    obj.gname = nts(buf[297:329], encoding, errors)
    obj.devmajor = nti(buf[329:337])
    obj.devminor = nti(buf[337:345])
    prefix = nts(buf[345:500], encoding, errors)

    # Old V7 tar format represents a directory as a regular
    # file with a trailing slash.
    if obj.type == AREGTYPE and obj.name.endswith("/"):
        obj.type = DIRTYPE

    # The old GNU sparse format occupies some of the unused
    # space in the buffer for up to 4 sparse structures.
    # Save them for later processing in _proc_sparse().
    if obj.type == GNUTYPE_SPARSE:
        pos = 386
        structs = []
        for i in range(4):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                # Stop at the first entry that cannot be parsed.
                break
            structs.append((offset, numbytes))
            pos += 24
        isextended = bool(buf[482])
        origsize = nti(buf[483:495])
        obj._sparse_structs = (structs, isextended, origsize)

    # Remove redundant slashes from directories.
    if obj.isdir():
        obj.name = obj.name.rstrip("/")

    # Reconstruct a ustar longname.
    if prefix and obj.type not in GNU_TYPES:
        obj.name = prefix + "/" + obj.name
    return obj
1079
@classmethod
def fromtarfile(cls, tarfile):
    """Read the next header block from the TarFile object tarfile
       and return the TarInfo object that results from processing it.
    """
    header = tarfile.fileobj.read(BLOCKSIZE)
    member = cls.frombuf(header, tarfile.encoding, tarfile.errors)
    # The header that was just read started one block before the
    # current file position.
    member.offset = tarfile.fileobj.tell() - BLOCKSIZE
    return member._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001089
Guido van Rossumd8faa362007-04-27 19:54:29 +00001090 #--------------------------------------------------------------------------
1091 # The following are methods that are called depending on the type of a
1092 # member. The entry point is _proc_member() which can be overridden in a
1093 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1094 # implement the following
1095 # operations:
1096 # 1. Set self.offset_data to the position where the data blocks begin,
1097 # if there is data that follows.
1098 # 2. Set tarfile.offset to the position where the next member's header will
1099 # begin.
1100 # 3. Return self or another valid TarInfo object.
def _proc_member(self, tarfile):
    """Dispatch to the _proc_*() method that handles this member's
       type and return its result.
    """
    member_type = self.type
    if member_type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
        return self._proc_gnulong(tarfile)
    if member_type == GNUTYPE_SPARSE:
        return self._proc_sparse(tarfile)
    if member_type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
        return self._proc_pax(tarfile)
    return self._proc_builtin(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001113
def _proc_builtin(self, tarfile):
    """Process a builtin type or an unknown type which
       will be treated as a regular file.
    """
    data_start = tarfile.fileobj.tell()
    self.offset_data = data_start

    next_header = data_start
    carries_data = self.isreg() or self.type not in SUPPORTED_TYPES
    if carries_data:
        # Skip the following data blocks.
        next_header += self._block(self.size)
    tarfile.offset = next_header

    # Patch the TarInfo object with saved global
    # header information.
    self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

    return self
1130
def _proc_gnulong(self, tarfile):
    """Process the blocks that hold a GNU longname
       or longlink member.

    Returns the TarInfo of the following header, patched with the
    long name or link name. Raises SubsequentHeaderError if that
    header is missing or bad.
    """
    raw = tarfile.fileobj.read(self._block(self.size))

    # Fetch the next header and process it.
    try:
        member = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Patch the TarInfo object from the next header with
    # the longname information.
    member.offset = self.offset
    if self.type == GNUTYPE_LONGNAME:
        member.name = nts(raw, tarfile.encoding, tarfile.errors)
    elif self.type == GNUTYPE_LONGLINK:
        member.linkname = nts(raw, tarfile.encoding, tarfile.errors)

    return member
1152
def _proc_sparse(self, tarfile):
    """Process a GNU sparse header plus extra headers.
    """
    # We already collected some sparse structures in frombuf().
    structs, isextended, origsize = self._sparse_structs
    del self._sparse_structs

    # Collect sparse structures from extended header blocks. Each
    # block holds up to 21 entries of 24 bytes (two 12 byte numbers)
    # and a continuation flag at offset 504.
    while isextended:
        block = tarfile.fileobj.read(BLOCKSIZE)
        for start in range(0, 21 * 24, 24):
            try:
                offset = nti(block[start:start + 12])
                numbytes = nti(block[start + 12:start + 24])
            except ValueError:
                break
            if offset and numbytes:
                structs.append((offset, numbytes))
        isextended = bool(block[504])
    self.sparse = structs

    # The next header follows the stored data, whose length is the
    # size from the header; afterwards size becomes the original size.
    data_start = tarfile.fileobj.tell()
    self.offset_data = data_start
    tarfile.offset = data_start + self._block(self.size)
    self.size = origsize
    return self
1180
def _proc_pax(self, tarfile):
    """Process an extended or global header as described in
       POSIX.1-2008.

    Parses the pax records stored in this member's data blocks,
    merges them into the applicable header dictionary, reads the
    following header and returns its TarInfo object (patched with the
    records if this is an extended header).

    Raises InvalidHeaderError if a record declares a length of zero,
    and SubsequentHeaderError if the following header is missing or
    bad.
    """
    # Read the header information.
    buf = tarfile.fileobj.read(self._block(self.size))

    # A pax header stores supplemental information for either
    # the following file (extended) or all following files
    # (global).
    if self.type == XGLTYPE:
        pax_headers = tarfile.pax_headers
    else:
        pax_headers = tarfile.pax_headers.copy()

    # Check if the pax header contains a hdrcharset field. This tells us
    # the encoding of the path, linkpath, uname and gname fields. Normally,
    # these fields are UTF-8 encoded but since POSIX.1-2008 tar
    # implementations are allowed to store them as raw binary strings if
    # the translation to UTF-8 fails.
    match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
    if match is not None:
        pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

    # For the time being, we don't care about anything other than "BINARY".
    # The only other value that is currently allowed by the standard is
    # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
    hdrcharset = pax_headers.get("hdrcharset")
    if hdrcharset == "BINARY":
        encoding = tarfile.encoding
    else:
        encoding = "utf-8"

    # Parse pax header information. A record looks like that:
    # "%d %s=%s\n" % (length, keyword, value). length is the size
    # of the complete record including the length field itself and
    # the newline. keyword and value are both UTF-8 encoded strings.
    regex = re.compile(br"(\d+) ([^=]+)=")
    pos = 0
    while True:
        match = regex.match(buf, pos)
        if not match:
            break

        length, keyword = match.groups()
        length = int(length)
        if length == 0:
            # A record claiming zero length would never advance pos,
            # so the loop would parse the same record forever. Treat
            # it as a corrupt header instead of hanging.
            raise InvalidHeaderError("invalid header")
        value = buf[match.end(2) + 1:match.start(1) + length - 1]

        # Normally, we could just use "utf-8" as the encoding and "strict"
        # as the error handler, but we better not take the risk. For
        # example, GNU tar <= 1.23 is known to store filenames it cannot
        # translate to UTF-8 as raw strings (unfortunately without a
        # hdrcharset=BINARY header).
        # We first try the strict standard encoding, and if that fails we
        # fall back on the user's encoding and error handler.
        keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                tarfile.errors)
        if keyword in PAX_NAME_FIELDS:
            value = self._decode_pax_field(value, encoding, tarfile.encoding,
                    tarfile.errors)
        else:
            value = self._decode_pax_field(value, "utf-8", "utf-8",
                    tarfile.errors)

        pax_headers[keyword] = value
        pos += length

    # Fetch the next header.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Process GNU sparse information.
    if "GNU.sparse.map" in pax_headers:
        # GNU extended sparse format version 0.1.
        self._proc_gnusparse_01(next, pax_headers)

    elif "GNU.sparse.size" in pax_headers:
        # GNU extended sparse format version 0.0.
        self._proc_gnusparse_00(next, pax_headers, buf)

    elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
        # GNU extended sparse format version 1.0.
        self._proc_gnusparse_10(next, pax_headers, tarfile)

    if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
        # Patch the TarInfo object with the extended header info.
        next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
        next.offset = self.offset

        if "size" in pax_headers:
            # If the extended header replaces the size field,
            # we need to recalculate the offset where the next
            # header starts.
            offset = next.offset_data
            if next.isreg() or next.type not in SUPPORTED_TYPES:
                offset += next._block(next.size)
            tarfile.offset = offset

    return next
1282
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001283 def _proc_gnusparse_00(self, next, pax_headers, buf):
1284 """Process a GNU tar extended sparse header, version 0.0.
1285 """
1286 offsets = []
1287 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1288 offsets.append(int(match.group(1)))
1289 numbytes = []
1290 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1291 numbytes.append(int(match.group(1)))
1292 next.sparse = list(zip(offsets, numbytes))
1293
1294 def _proc_gnusparse_01(self, next, pax_headers):
1295 """Process a GNU tar extended sparse header, version 0.1.
1296 """
1297 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1298 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1299
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

    The data area starts with newline separated decimal numbers: a
    count of pairs followed by that many pairs of numbers. The
    member's data offset is set to the file position after the last
    block read here.
    """
    numbers = []
    pending = tarfile.fileobj.read(BLOCKSIZE)
    count, pending = pending.split(b"\n", 1)
    wanted = int(count) * 2
    while len(numbers) < wanted:
        if b"\n" not in pending:
            # The current number may continue in the next block.
            pending += tarfile.fileobj.read(BLOCKSIZE)
        number, pending = pending.split(b"\n", 1)
        numbers.append(int(number))
    next.offset_data = tarfile.fileobj.tell()
    next.sparse = list(zip(numbers[::2], numbers[1::2]))
1315
def _apply_pax_info(self, pax_headers, encoding, errors):
    """Replace fields with supplemental information from a previous
       pax extended or global header.

    A copy of pax_headers is stored on the object afterwards.
    """
    for keyword, value in pax_headers.items():
        if keyword == "GNU.sparse.name":
            self.path = value
        elif keyword in ("GNU.sparse.size", "GNU.sparse.realsize"):
            self.size = int(value)
        elif keyword in PAX_FIELDS:
            if keyword in PAX_NUMBER_FIELDS:
                convert = PAX_NUMBER_FIELDS[keyword]
                try:
                    value = convert(value)
                except ValueError:
                    # Fall back to 0 for a number that cannot be parsed.
                    value = 0
            if keyword == "path":
                value = value.rstrip("/")
            setattr(self, keyword, value)

    self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001338
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001339 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1340 """Decode a single field from a pax record.
1341 """
1342 try:
1343 return value.decode(encoding, "strict")
1344 except UnicodeDecodeError:
1345 return value.decode(fallback_encoding, fallback_errors)
1346
def _block(self, count):
    """Round up a byte count by BLOCKSIZE and return it,
       e.g. _block(834) => 1024.
    """
    full_blocks = count // BLOCKSIZE
    if count % BLOCKSIZE:
        # A partial block still occupies a whole block.
        full_blocks += 1
    return full_blocks * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001355
def isreg(self):
    """Return True if the member's type is one of REGULAR_TYPES."""
    return self.type in REGULAR_TYPES
def isfile(self):
    """Return the result of isreg()."""
    return self.isreg()
def isdir(self):
    """Return True if the member's type is DIRTYPE."""
    return self.type == DIRTYPE
def issym(self):
    """Return True if the member's type is SYMTYPE."""
    return self.type == SYMTYPE
def islnk(self):
    """Return True if the member's type is LNKTYPE."""
    return self.type == LNKTYPE
def ischr(self):
    """Return True if the member's type is CHRTYPE."""
    return self.type == CHRTYPE
def isblk(self):
    """Return True if the member's type is BLKTYPE."""
    return self.type == BLKTYPE
def isfifo(self):
    """Return True if the member's type is FIFOTYPE."""
    return self.type == FIFOTYPE
def issparse(self):
    """Return True if sparse information has been set on the member."""
    return self.sparse is not None
def isdev(self):
    """Return True if the member's type is CHRTYPE, BLKTYPE or FIFOTYPE."""
    return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1376# class TarInfo
1377
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    # Debug verbosity; may be set from 0 (no msgs) to 3 (all msgs).
    debug = 0

    # If true, add the content of a linked file to the tar file,
    # else add the link itself.
    dereference = False

    # If true, skip empty or invalid blocks and continue processing.
    ignore_zeros = False

    # If 0, fatal errors only appear in debug messages (if debug >= 0).
    # If > 0, errors are passed to the caller as exceptions.
    errorlevel = 1

    # The format to use when creating an archive.
    format = DEFAULT_FORMAT

    # Encoding for 8-bit character strings.
    encoding = ENCODING

    # Error handler for unicode conversion.
    errors = None

    # The default TarInfo class to use.
    tarinfo = TarInfo

    # The file-object for extractfile().
    fileobject = ExFileObject
Guido van Rossumd8faa362007-04-27 19:54:29 +00001403
def __init__(self, name=None, mode="r", fileobj=None, format=None,
             tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
             errors="surrogateescape", pax_headers=None, debug=None,
             errorlevel=None):
    """Open an (uncompressed) tar archive `name'. `mode' is one of 'r'
       (read an existing archive, the default), 'a' (append to an
       archive, creating it if it does not exist), 'w' (create a new
       archive, overwriting an existing one) or 'x' (opened with the
       underlying file mode 'xb').
       If `fileobj' is given, it is used for reading or writing data and
       its `mode' attribute, if present, replaces the underlying file
       mode. `fileobj' is not closed when the TarFile is closed.
       Every argument left at None keeps the class-level default.
    """
    file_modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
    if mode not in file_modes:
        raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
    self.mode = mode
    self._mode = file_modes[mode]

    if fileobj:
        # Caller-supplied file object: borrow its name and mode if it
        # has them, and remember that we must not close it.
        if (name is None and hasattr(fileobj, "name")
                and isinstance(fileobj.name, (str, bytes))):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
    else:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        self._extfileobj = False
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Per-instance overrides of the class-level defaults.
    overrides = (
        ("format", format),
        ("tarinfo", tarinfo),
        ("dereference", dereference),
        ("ignore_zeros", ignore_zeros),
        ("encoding", encoding),
    )
    for attr, value in overrides:
        if value is not None:
            setattr(self, attr, value)
    self.errors = errors

    # Global pax headers are only kept for pax archives.
    if pax_headers is not None and self.format == PAX_FORMAT:
        self.pax_headers = pax_headers
    else:
        self.pax_headers = {}

    for attr, value in (("debug", debug), ("errorlevel", errorlevel)):
        if value is not None:
            setattr(self, attr, value)

    # Init datastructures.
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    member = self.tarinfo.fromtarfile(self)
                    self.members.append(member)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as exc:
                    raise ReadError(str(exc))

        if self.mode in ("a", "w", "x"):
            self._loaded = True

            if self.pax_headers:
                header = self.tarinfo.create_pax_global_header(
                    self.pax_headers.copy())
                self.fileobj.write(header)
                self.offset += len(header)
    except BaseException:
        # Do not leak a file we opened ourselves, then re-raise.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001501
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001502 #--------------------------------------------------------------------------
1503 # Below are the classmethods which act as alternate constructors to the
1504 # TarFile class. The open() method is the only one that is needed for
1505 # public use; it is the "super"-constructor and is able to select an
1506 # adequate "sub"-constructor for a particular compression using the mapping
1507 # from OPEN_METH.
1508 #
1509 # This concept allows one to subclass TarFile without losing the comfort of
1510 # the super-constructor. A sub-constructor is registered and made available
1511 # by adding it to the mapping in OPEN_METH.
1512
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
       an appropriate TarFile class.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'r:xz'       open for reading with lzma compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression
       'w:xz'       open for writing with lzma compression

       'x' or 'x:'  create a tarfile exclusively without compression, raise
                    an exception if the file is already created
       'x:gz'       create a gzip compressed tarfile, raise an exception
                    if the file is already created
       'x:bz2'      create a bzip2 compressed tarfile, raise an exception
                    if the file is already created
       'x:xz'       create an lzma compressed tarfile, raise an exception
                    if the file is already created

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'r|xz'       open an lzma compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing
       'w|xz'       open an lzma compressed stream for writing
    """
    if not (name or fileobj):
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Try each registered *open() method until one accepts the file.
        for comptype in cls.OPEN_METH:
            opener = getattr(cls, cls.OPEN_METH[comptype])
            if fileobj is not None:
                start_pos = fileobj.tell()
            try:
                return opener(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Rewind so the next candidate sees the same data.
                if fileobj is not None:
                    fileobj.seek(start_pos)
        raise ReadError("file could not be opened successfully")

    if ":" in mode:
        filemode, _, comptype = mode.partition(":")
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype not in cls.OPEN_METH:
            raise CompressionError("unknown compression type %r" % comptype)
        opener = getattr(cls, cls.OPEN_METH[comptype])
        return opener(name, filemode, fileobj, **kwargs)

    if "|" in mode:
        filemode, _, comptype = mode.partition("|")
        filemode = filemode or "r"
        comptype = comptype or "tar"

        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            archive = cls(name, filemode, stream, **kwargs)
        except BaseException:
            stream.close()
            raise
        # The stream wrapper was created here, so the TarFile owns it.
        archive._extfileobj = False
        return archive

    if mode in ("a", "w", "x"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001601
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open the uncompressed tar archive `name' for reading or writing.
       `mode' must be 'r', 'a', 'w' or 'x'; anything else raises
       ValueError.
    """
    allowed_modes = ("r", "a", "w", "x")
    if mode not in allowed_modes:
        raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
    return cls(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001609
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open the gzip compressed tar archive `name' for reading or
       writing. `mode' must be 'r', 'w' or 'x'; appending is not allowed.
    """
    if mode not in ("r", "w", "x"):
        raise ValueError("mode must be 'r', 'w' or 'x'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    try:
        gz_file = gzip.GzipFile(filename=name, mode=mode + "b",
                                compresslevel=compresslevel, fileobj=fileobj)
    except OSError:
        if fileobj is not None and mode == 'r':
            raise ReadError("not a gzip file")
        raise

    try:
        archive = cls.taropen(name, mode, gz_file, **kwargs)
    except BaseException as exc:
        # Always release the gzip wrapper; an OSError while reading
        # is reported as "not a gzip file".
        gz_file.close()
        if isinstance(exc, OSError) and mode == 'r':
            raise ReadError("not a gzip file")
        raise
    archive._extfileobj = False
    return archive
1643
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open the bzip2 compressed tar archive `name' for reading or
       writing. `mode' must be 'r', 'w' or 'x'; appending is not allowed.
    """
    if mode not in ("r", "w", "x"):
        raise ValueError("mode must be 'r', 'w' or 'x'")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    # A given file object takes precedence over the file name.
    target = fileobj or name
    bz2_file = bz2.BZ2File(target, mode, compresslevel=compresslevel)

    try:
        archive = cls.taropen(name, mode, bz2_file, **kwargs)
    except BaseException as exc:
        # Always release the bzip2 wrapper; a read failure is reported
        # as "not a bzip2 file".
        bz2_file.close()
        if isinstance(exc, (OSError, EOFError)) and mode == 'r':
            raise ReadError("not a bzip2 file")
        raise
    archive._extfileobj = False
    return archive
1672
@classmethod
def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
    """Open the lzma compressed tar archive `name' for reading or
       writing. `mode' must be 'r', 'w' or 'x'; appending is not allowed.
    """
    if mode not in ("r", "w", "x"):
        raise ValueError("mode must be 'r', 'w' or 'x'")

    try:
        import lzma
    except ImportError:
        raise CompressionError("lzma module is not available")

    # A given file object takes precedence over the file name.
    target = fileobj or name
    xz_file = lzma.LZMAFile(target, mode, preset=preset)

    try:
        archive = cls.taropen(name, mode, xz_file, **kwargs)
    except BaseException as exc:
        # Always release the lzma wrapper; a read failure is reported
        # as "not an lzma file".
        xz_file.close()
        if isinstance(exc, (lzma.LZMAError, EOFError)) and mode == 'r':
            raise ReadError("not an lzma file")
        raise
    archive._extfileobj = False
    return archive
1700
# Registry of all *open() methods: maps a compression type to the
# name of the classmethod that opens archives of that type.
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz":  "gzopen",    # gzip compressed tar
    "bz2": "bz2open",   # bzip2 compressed tar
    "xz":  "xzopen",    # lzma compressed tar
}
1708
1709 #--------------------------------------------------------------------------
1710 # The public methods which TarFile provides:
1711
def close(self):
    """Close the TarFile. In the write modes ('a', 'w', 'x') two
       finishing zero blocks are appended to the archive first, and the
       archive is then padded with zeros up to a multiple of RECORDSIZE.
       Closing an already closed TarFile does nothing.
    """
    if self.closed:
        return

    self.closed = True
    try:
        if self.mode in ("a", "w", "x"):
            trailer_size = BLOCKSIZE * 2
            self.fileobj.write(NUL * trailer_size)
            self.offset += trailer_size
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            leftover = self.offset % RECORDSIZE
            if leftover > 0:
                self.fileobj.write(NUL * (RECORDSIZE - leftover))
    finally:
        # Only close file objects that were not handed in by the caller.
        if not self._extfileobj:
            self.fileobj.close()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001732
def getmember(self, name):
    """Return the TarInfo object for member `name' as found by
       _getmember(). If no such member is found, KeyError is raised.
       If a member occurs more than once in the archive, its last
       occurrence is assumed to be the most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001743
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects.
       The list has the same order as the members in the archive.
    """
    self._check()
    # A complete member list requires that the whole archive has been
    # scanned, so do that first if it has not happened yet.
    if not self._loaded:
        self._load()
    return self.members
1753
def getnames(self):
    """Return the names of the archive members as a list, in the same
       order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001759
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object from the result of os.stat or equivalent
       on an existing file. The file is either named by `name', or
       specified as a file object `fileobj' with a file descriptor. If
       given, `arcname' specifies an alternative name for the file in the
       archive, otherwise, the name is taken from the 'name' attribute of
       'fileobj', or the 'name' argument. The name should be a text
       string. None is returned for file types that are not handled.
    """
    self._check("awx")

    # When fileobj is given, replace name by
    # fileobj's real name.
    if fileobj is not None:
        name = fileobj.name

    # Building the name of the member in the archive.
    # Backward slashes are converted to forward slashes,
    # Absolute paths are turned to relative paths.
    if arcname is None:
        arcname = name
    _drive, arcname = os.path.splitdrive(arcname)
    arcname = arcname.replace(os.sep, "/").lstrip("/")

    # Now, fill the TarInfo object with
    # information specific for the file.
    tarinfo = self.tarinfo()
    tarinfo.tarfile = self  # Not needed

    # Use os.stat or os.lstat, depending on platform
    # and if symlinks shall be resolved.
    if fileobj is not None:
        statres = os.fstat(fileobj.fileno())
    elif hasattr(os, "lstat") and not self.dereference:
        statres = os.lstat(name)
    else:
        statres = os.stat(name)
    linkname = ""

    st_mode = statres.st_mode
    if stat.S_ISREG(st_mode):
        inode = (statres.st_ino, statres.st_dev)
        if not self.dereference and statres.st_nlink > 1 and \
                inode in self.inodes and arcname != self.inodes[inode]:
            # Is it a hardlink to an already
            # archived file?
            ftype = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # The inode is added only if its valid.
            # For win32 it is always 0.
            ftype = REGTYPE
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(st_mode):
        ftype = DIRTYPE
    elif stat.S_ISFIFO(st_mode):
        ftype = FIFOTYPE
    elif stat.S_ISLNK(st_mode):
        ftype = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(st_mode):
        ftype = CHRTYPE
    elif stat.S_ISBLK(st_mode):
        ftype = BLKTYPE
    else:
        return None

    # Fill the TarInfo object with all
    # information we can get.
    tarinfo.name = arcname
    tarinfo.mode = st_mode
    tarinfo.uid = statres.st_uid
    tarinfo.gid = statres.st_gid
    # Only regular files carry data in the archive.
    tarinfo.size = statres.st_size if ftype == REGTYPE else 0
    tarinfo.mtime = statres.st_mtime
    tarinfo.type = ftype
    tarinfo.linkname = linkname
    if pwd:
        try:
            tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
        except KeyError:
            # No passwd entry for this uid; leave uname untouched.
            pass
    if grp:
        try:
            tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
        except KeyError:
            # No group entry for this gid; leave gname untouched.
            pass

    if ftype in (CHRTYPE, BLKTYPE):
        if hasattr(os, "major") and hasattr(os, "minor"):
            tarinfo.devmajor = os.major(statres.st_rdev)
            tarinfo.devminor = os.minor(statres.st_rdev)
    return tarinfo
1859
def list(self, verbose=True, *, members=None):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced. `members' is optional and must be a subset of the
       list returned by getmembers().
    """
    self._check()

    entries = self if members is None else members
    for entry in entries:
        if verbose:
            _safe_print(stat.filemode(entry.mode))
            owner = "%s/%s" % (entry.uname or entry.uid,
                               entry.gname or entry.gid)
            _safe_print(owner)
            if entry.ischr() or entry.isblk():
                # Device nodes show "major,minor" in place of a size.
                device = "%d,%d" % (entry.devmajor, entry.devminor)
                _safe_print("%10s" % device)
            else:
                _safe_print("%10d" % entry.size)
            timestamp = time.localtime(entry.mtime)[:6]
            _safe_print("%d-%02d-%02d %02d:%02d:%02d" % timestamp)

        entry_name = entry.name
        suffix = "/" if entry.isdir() else ""
        _safe_print(entry_name + suffix)

        if verbose:
            if entry.issym():
                _safe_print("-> " + entry.linkname)
            if entry.islnk():
                _safe_print("link to " + entry.linkname)
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001891
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. `exclude' is a function that should
       return True for each filename to be excluded; passing it emits a
       DeprecationWarning. `filter' is a function that expects a TarInfo
       object argument and returns the changed TarInfo object, if it
       returns None the TarInfo object will be excluded from the archive.
    """
    self._check("awx")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                      DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)
    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        with bltn_open(name, "rb") as source:
            self.addfile(tarinfo, source)
    else:
        # Everything else is stored as a header only; directories
        # additionally pull in their entries when `recursive' is set.
        is_directory = tarinfo.isdir()
        self.addfile(tarinfo)
        if is_directory and recursive:
            for entry in os.listdir(name):
                self.add(os.path.join(name, entry),
                         os.path.join(arcname, entry),
                         recursive, exclude, filter=filter)
1952
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, it should be a binary file, and tarinfo.size bytes are read
       from it and added to the archive. You can create TarInfo objects
       directly, or by using gettarinfo().
    """
    self._check("awx")

    # Work on a copy; the copy is what ends up in self.members.
    member = copy.copy(tarinfo)

    header = member.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset += len(header)

    # If there's data to follow, append it.
    if fileobj is not None:
        copyfileobj(fileobj, self.fileobj, member.size)
        full_blocks, leftover = divmod(member.size, BLOCKSIZE)
        if leftover > 0:
            # Pad the last partial block with NULs up to BLOCKSIZE.
            self.fileobj.write(NUL * (BLOCKSIZE - leftover))
            full_blocks += 1
        self.offset += full_blocks * BLOCKSIZE

    self.members.append(member)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001977
def extractall(self, path=".", members=None, *, numeric_owner=False):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers(). If `numeric_owner` is True, only
       the numbers for user/group names are used and not the names.
    """
    if members is None:
        members = self

    deferred_dirs = []
    for member in members:
        if member.isdir():
            # Extract directories with a safe mode; the real attributes
            # are applied in the second pass below.
            deferred_dirs.append(member)
            member = copy.copy(member)
            member.mode = 0o700
        # Do not set_attrs directories, as we will do that further down
        self.extract(member, path, set_attrs=not member.isdir(),
                     numeric_owner=numeric_owner)

    # Set correct owner, mtime and filemode on directories, walking
    # them in reverse sorted name order.
    for dirinfo in reversed(sorted(deferred_dirs, key=lambda d: d.name)):
        dirpath = os.path.join(path, dirinfo.name)
        try:
            self.chown(dirinfo, dirpath, numeric_owner=numeric_owner)
            self.utime(dirinfo, dirpath)
            self.chmod(dirinfo, dirpath)
        except ExtractError as exc:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % exc)
2017
def extract(self, member, path="", set_attrs=True, *, numeric_owner=False):
    """Extract a single member below `path', using its full name.

    `member' may be a filename or a TarInfo object. `path' defaults to
    the current working directory. Owner, mtime and mode are applied
    unless `set_attrs' is False. If `numeric_owner' is True, the
    numeric uid/gid are used rather than the user/group names.

    OSError is re-raised when errorlevel > 0 and ExtractError when
    errorlevel > 1; otherwise the problem is only reported via _dbg().
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    if tarinfo.islnk():
        # makelink() needs to know where the hard link target lives.
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    try:
        self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                             set_attrs=set_attrs,
                             numeric_owner=numeric_owner)
    except OSError as exc:
        if self.errorlevel > 0:
            raise
        if exc.filename is None:
            report = "tarfile: %s" % exc.strerror
        else:
            report = "tarfile: %s %r" % (exc.strerror, exc.filename)
        self._dbg(1, report)
    except ExtractError as exc:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % exc)
2055
def extractfile(self, member):
    """Return a file object for reading a member's data.

    `member' may be a filename or a TarInfo object. For a regular file
    (or a member of unknown type) the object built by self.fileobject
    is returned; for a link, the file object of its target is
    returned. Members without data (directories, devices, ...) yield
    None. Raises StreamError when a link is requested from a
    non-seekable stream.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Members with unknown types are treated as regular files.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    if not (tarinfo.islnk() or tarinfo.issym()):
        # Nothing to read for directories, device nodes and the like.
        return None

    if isinstance(self.fileobj, _Stream):
        # A link cannot be followed on a stream of tar blocks because
        # that would require seeking to its target.
        raise StreamError("cannot extract (sym)link as file object")

    # A (sym)link's file object is its target's file object.
    return self.extractfile(self._find_link_target(tarinfo))
2086
def _extract_member(self, tarinfo, targetpath, set_attrs=True,
                    numeric_owner=False):
    """Materialize the TarInfo object `tarinfo' as the physical file
    `targetpath', creating missing parent directories first and
    applying owner, mode and mtime afterwards unless `set_attrs' is
    False.
    """
    # Drop trailing slashes and switch to the platform's separator.
    targetpath = targetpath.rstrip("/").replace("/", os.sep)

    # Parent directories that are not part of the archive are created
    # with default permissions.
    parent = os.path.dirname(targetpath)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    if tarinfo.islnk() or tarinfo.issym():
        self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
    else:
        self._dbg(1, tarinfo.name)

    # Pick the make*() method matching the member type.
    if tarinfo.isreg():
        maker = self.makefile
    elif tarinfo.isdir():
        maker = self.makedir
    elif tarinfo.isfifo():
        maker = self.makefifo
    elif tarinfo.ischr() or tarinfo.isblk():
        maker = self.makedev
    elif tarinfo.islnk() or tarinfo.issym():
        maker = self.makelink
    elif tarinfo.type not in SUPPORTED_TYPES:
        maker = self.makeunknown
    else:
        maker = self.makefile
    maker(tarinfo, targetpath)

    if not set_attrs:
        return

    self.chown(tarinfo, targetpath, numeric_owner)
    if not tarinfo.issym():
        # Mode and mtime are not applied to symbolic links.
        self.chmod(tarinfo, targetpath)
        self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002130
2131 #--------------------------------------------------------------------------
2132 # Below are the different file methods. They are called via
2133 # _extract_member() when extract() is called. They can be replaced in a
2134 # subclass to implement other functionality.
2135
def makedir(self, tarinfo, targetpath):
    """Create the directory `targetpath'; an already existing one is
    left untouched.
    """
    try:
        # The directory gets a restrictive mode here; the mode stored
        # in the archive is applied by the caller afterwards.
        os.mkdir(targetpath, 0o700)
    except FileExistsError:
        return
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002145
def makefile(self, tarinfo, targetpath):
    """Write the member's data from the archive to the new file
    `targetpath'.
    """
    archive = self.fileobj
    archive.seek(tarinfo.offset_data)
    with bltn_open(targetpath, "wb") as out:
        segments = tarinfo.sparse
        if segments is None:
            # Ordinary member: one contiguous run of tarinfo.size bytes.
            copyfileobj(archive, out, tarinfo.size, ReadError)
            return

        # Sparse member: place every stored segment at its offset, then
        # fix the final length of the file.
        for position, length in segments:
            out.seek(position)
            copyfileobj(archive, out, length, ReadError)
        out.seek(tarinfo.size)
        out.truncate()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002160
def makeunknown(self, tarinfo, targetpath):
    """Extract a member of unknown type to `targetpath' as if it were
    a regular file and report that via _dbg().
    """
    self.makefile(tarinfo, targetpath)
    notice = ("tarfile: Unknown file type %r, "
              "extracted as regular file." % tarinfo.type)
    self._dbg(1, notice)
2168
def makefifo(self, tarinfo, targetpath):
    """Create a fifo at `targetpath'.

    Raises ExtractError on platforms that lack os.mkfifo.
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002176
def makedev(self, tarinfo, targetpath):
    """Create a character or block device node at `targetpath'.

    Raises ExtractError on platforms that lack os.mknod or os.makedev.
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the permission bits with the matching device type flag.
    kind = stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR
    mode = tarinfo.mode | kind

    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, mode, device)
2191
def makelink(self, tarinfo, targetpath):
    """Create a symbolic or hard link at `targetpath'.

    When the link cannot be created (symlink_exception is raised), or
    when a hard link's target does not exist on disk, the referenced
    archive member is extracted to `targetpath' instead. Raises
    ExtractError if that fallback cannot find the member in the
    archive.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        elif os.path.exists(tarinfo._link_target):
            # _link_target is prepared by extract().
            os.link(tarinfo._link_target, targetpath)
        else:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
    except symlink_exception:
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002214
def chown(self, tarinfo, targetpath, numeric_owner):
    """Set owner of targetpath according to tarinfo.

    Nothing happens unless the pwd module is available and the process
    runs with effective uid 0. If numeric_owner is True, tarinfo.uid
    and tarinfo.gid are used directly; otherwise tarinfo.uname and
    tarinfo.gname are looked up first and the numeric ids only serve as
    a fallback for unknown names.

    Raises ExtractError, chained to the underlying OSError, if the
    owner cannot be changed.
    """
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        # We have to be root to change ownership.
        return

    u = tarinfo.uid
    g = tarinfo.gid
    if not numeric_owner:
        # Prefer the ids the local system assigns to the stored names.
        try:
            g = grp.getgrnam(tarinfo.gname)[2]
        except KeyError:
            pass
        try:
            u = pwd.getpwnam(tarinfo.uname)[2]
        except KeyError:
            pass

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            # Change the link itself, not the file it points to.
            os.lchown(targetpath, u, g)
        else:
            os.chown(targetpath, u, g)
    except OSError as e:
        # Keep the OS-level reason attached as the explicit cause.
        raise ExtractError("could not change owner") from e
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002240
def chmod(self, tarinfo, targetpath):
    """Set file permissions of targetpath according to tarinfo.

    Does nothing on platforms without os.chmod. Raises ExtractError,
    chained to the underlying OSError, if the mode cannot be changed.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except OSError as e:
        # Keep the OS-level reason attached as the explicit cause.
        raise ExtractError("could not change mode") from e
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002249
def utime(self, tarinfo, targetpath):
    """Set modification time of targetpath according to tarinfo.

    tarinfo.mtime is applied as both access and modification time.
    Does nothing on platforms without os.utime. Raises ExtractError,
    chained to the underlying OSError, if the times cannot be changed.
    """
    if not hasattr(os, 'utime'):
        return
    try:
        os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
    except OSError as e:
        # Keep the OS-level reason attached as the explicit cause.
        raise ExtractError("could not change modification time") from e
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002259
2260 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, when
    TarFile is opened for reading. Return None if there is no more
    available.

    Raises ReadError when the data ends before the expected offset,
    when the very first header is empty, truncated or invalid, or when
    a subsequent header is reported as broken. The ReadError already
    carries the header error's message, so the header error is not
    shown as additional context.
    """
    self._check("ra")
    if self.firstmember is not None:
        # Hand out the member that was stored in self.firstmember.
        m = self.firstmember
        self.firstmember = None
        return m

    # Advance the file pointer.
    if self.offset != self.fileobj.tell():
        # Seek to the byte just before the expected offset and read it,
        # so that a too-short file is noticed.
        self.fileobj.seek(self.offset - 1)
        if not self.fileobj.read(1):
            raise ReadError("unexpected end of data")

    # Read the next block.
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as e:
            if self.ignore_zeros:
                # Skip the block and retry at the following one.
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as e:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
            elif self.offset == 0:
                raise ReadError(str(e)) from None
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file") from None
        except TruncatedHeaderError as e:
            if self.offset == 0:
                raise ReadError(str(e)) from None
        except SubsequentHeaderError as e:
            raise ReadError(str(e)) from None
        # Either a header was read, or the error is treated as the end
        # of the archive (tarinfo stays None).
        break

    if tarinfo is not None:
        self.members.append(tarinfo)
    else:
        self._loaded = True

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002311
2312 #--------------------------------------------------------------------------
2313 # Little helper methods:
2314
def _getmember(self, name, tarinfo=None, normalize=False):
    """Return the last archive member called `name', or None.

    The member list is searched from bottom to top. If `tarinfo' is
    given, only members stored before it are considered. With
    `normalize' set, both `name' and the member names are passed
    through os.path.normpath() before they are compared.
    """
    # getmembers() makes sure that all members have been loaded.
    candidates = self.getmembers()

    if tarinfo is not None:
        # Cut the search list off right before tarinfo.
        candidates = candidates[:candidates.index(tarinfo)]

    if normalize:
        name = os.path.normpath(name)
        canonical = os.path.normpath
    else:
        canonical = lambda member_name: member_name

    for candidate in reversed(candidates):
        if name == canonical(candidate.name):
            return candidate
    return None
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002337
def _load(self):
    """Call next() until it returns None, then mark the archive as
    completely loaded.
    """
    while self.next() is not None:
        pass
    self._loaded = True
2347
def _check(self, mode=None):
    """Raise OSError if the TarFile is closed, or if `mode' is given
    and the TarFile's own mode is not contained in it.
    """
    if self.closed:
        raise OSError("%s is closed" % self.__class__.__name__)
    if mode is None or self.mode in mode:
        return
    raise OSError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002356
def _find_link_target(self, tarinfo):
    """Return the archive member that the symlink or hardlink member
    `tarinfo' refers to. Raises KeyError if there is none.
    """
    if tarinfo.issym():
        # A symlink target is taken relative to the link's directory
        # and looked up in the entire archive.
        pieces = (os.path.dirname(tarinfo.name), tarinfo.linkname)
        linkname = "/".join(piece for piece in pieces if piece)
        limit = None
    else:
        # A hard link refers to a file archived earlier, so only the
        # members in front of the link are searched.
        linkname = tarinfo.linkname
        limit = tarinfo

    target = self._getmember(linkname, tarinfo=limit, normalize=True)
    if target is None:
        raise KeyError("linkname %r not found" % linkname)
    return target
2375
def __iter__(self):
    """Iterate over the members of the archive.

    A fully loaded archive simply yields self.members. Otherwise
    members are pulled in one at a time through next(), and the
    archive is marked as loaded once next() has nothing left.
    """
    if self._loaded:
        yield from self.members
        return

    cursor = 0

    # Fix for SF #1100429: Under rare circumstances it can
    # happen that getmembers() is called during iteration,
    # which will have already exhausted the next() method.
    if self.firstmember is not None:
        first = self.next()
        cursor += 1
        yield first

    while True:
        if cursor < len(self.members):
            # This member has been read already.
            current = self.members[cursor]
        elif self._loaded:
            return
        else:
            current = self.next()
            if not current:
                self._loaded = True
                return
        cursor += 1
        yield current
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002406
def _dbg(self, level, msg):
    """Print `msg' to sys.stderr unless `level' exceeds self.debug.
    """
    if level > self.debug:
        return
    print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002412
def __enter__(self):
    """Enter the runtime context: run _check() and return the TarFile
    itself.
    """
    self._check()
    return self
2416
def __exit__(self, type, value, traceback):
    """Leave the runtime context.

    Without an exception the archive is closed via close(). After an
    exception only the underlying file object is closed (unless it was
    supplied from outside) and the TarFile is flagged as closed.
    """
    if type is None:
        self.close()
        return

    # An exception occurred. We must not call close() because
    # it would try to write end-of-archive blocks and padding.
    if not self._extfileobj:
        self.fileobj.close()
    self.closed = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002426
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002427#--------------------
2428# exported functions
2429#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
    are able to handle, else return False.

    name is normally a filename. An object with a read() method is
    treated as an already opened file instead: it is probed through
    the fileobj argument of open(), and its position is put back to
    where it was before the probe, whether or not the probe succeeds.
    """
    try:
        if hasattr(name, "read"):
            start = name.tell()
            try:
                t = open(fileobj=name)
            finally:
                # Leave the caller's file object where we found it.
                name.seek(start)
        else:
            t = open(name)
        t.close()
        return True
    except TarError:
        return False
2440
# Module-level alias for TarFile.open. It rebinds the name "open" in this
# module, so the open() calls in is_tarfile() and main() resolve to it.
open = TarFile.open
Serhiy Storchakad27b4552013-11-24 01:53:29 +02002442
2443
def main():
    """Command line interface of the module.

    Exactly one of the actions --test, --list, --extract or --create is
    carried out; without any action the help text is printed and the
    process exits with status 1.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='A simple command line interface for tarfile module.')
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Verbose output')
    actions = parser.add_mutually_exclusive_group()
    actions.add_argument('-l', '--list', metavar='<tarfile>',
                         help='Show listing of a tarfile')
    actions.add_argument('-e', '--extract', nargs='+',
                         metavar=('<tarfile>', '<output_dir>'),
                         help='Extract tarfile into target dir')
    actions.add_argument('-c', '--create', nargs='+',
                         metavar=('<name>', '<file>'),
                         help='Create tarfile from sources')
    actions.add_argument('-t', '--test', metavar='<tarfile>',
                         help='Test if a tarfile is valid')
    args = parser.parse_args()

    # Shared by all actions that read an existing archive.
    not_an_archive = '{!r} is not a tar archive.\n'

    if args.test:
        archive = args.test
        if not is_tarfile(archive):
            parser.exit(1, not_an_archive.format(archive))
        else:
            with open(archive, 'r') as tar:
                tar.getmembers()
                print(tar.getmembers(), file=sys.stderr)
            if args.verbose:
                print('{!r} is a tar archive.'.format(archive))

    elif args.list:
        archive = args.list
        if not is_tarfile(archive):
            parser.exit(1, not_an_archive.format(archive))
        else:
            with TarFile.open(archive, 'r:*') as tf:
                tf.list(verbose=args.verbose)

    elif args.extract:
        operands = args.extract
        if len(operands) > 2:
            parser.exit(1, parser.format_help())
        # The output directory is optional and defaults to os.curdir.
        archive = operands[0]
        destination = operands[1] if len(operands) == 2 else os.curdir

        if not is_tarfile(archive):
            parser.exit(1, not_an_archive.format(archive))
        else:
            with TarFile.open(archive, 'r:*') as tf:
                tf.extractall(path=destination)
            if args.verbose:
                if destination == '.':
                    report = '{!r} file is extracted.'.format(archive)
                else:
                    report = ('{!r} file is extracted '
                              'into {!r} directory.').format(archive,
                                                             destination)
                print(report)

    elif args.create:
        # The first operand names the archive, the rest are its sources.
        archive, *sources = args.create

        # Compression is derived from the archive's file extension.
        compression_by_suffix = {
            '.gz': 'gz', '.tgz': 'gz',
            '.xz': 'xz', '.txz': 'xz',
            '.bz2': 'bz2', '.tbz': 'bz2', '.tbz2': 'bz2', '.tb2': 'bz2',
        }
        suffix = os.path.splitext(archive)[1]
        compression = compression_by_suffix.get(suffix)
        mode = 'w' if compression is None else 'w:' + compression

        with TarFile.open(archive, mode) as tf:
            for source in sources:
                tf.add(source)

        if args.verbose:
            print('{!r} file created.'.format(archive))

    else:
        parser.exit(1, parser.format_help())
2533
# Run the command line interface when the module is executed as a script.
if __name__ == '__main__':
    main()