blob: 395c0f1d30040d02b787101028a3da4502de75f0 [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
Guido van Rossumd8faa362007-04-27 19:54:29 +000032version = "0.9.0"
Guido van Rossum98297ee2007-11-06 21:34:58 +000033__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
Guido van Rossum98297ee2007-11-06 21:34:58 +000034__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000035
36#---------
37# Imports
38#---------
Serhiy Storchakacf4a2f22015-03-11 17:18:03 +020039from builtins import open as bltn_open
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000040import sys
41import os
Eli Bendersky74c503b2012-01-03 06:26:13 +020042import io
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000043import shutil
44import stat
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000045import time
46import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000047import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000048import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000049
50try:
Xavier de Gayef44abda2016-12-09 09:33:09 +010051 import pwd
Brett Cannoncd171c82013-07-04 17:43:24 -040052except ImportError:
Xavier de Gayef44abda2016-12-09 09:33:09 +010053 pwd = None
54try:
55 import grp
56except ImportError:
57 grp = None
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000058
# Exceptions that mean "a symbolic link cannot be created here":
# os.symlink on Windows prior to 6.0 raises NotImplementedError, and
# OSError (winerror=1314) will be raised if the caller does not hold the
# SeCreateSymbolicLinkPrivilege privilege.
symlink_exception = (AttributeError, NotImplementedError, OSError)
67
# Public names exported by "from tarfile import *".
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError",
           "CompressionError", "StreamError", "ExtractError", "HeaderError",
           "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT",
           "DEFAULT_FORMAT", "open"]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000073
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
# Both magic strings cover the 8 bytes of the header's magic + version
# fields.  The GNU variant is "ustar" followed by TWO spaces and a null.
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = PAX_FORMAT
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000109
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Each key maps to the callable used to
# convert the field's value.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# Default text encoding: always UTF-8 on Windows (os.name == "nt"),
# otherwise whatever the interpreter reports as the filesystem encoding.
if os.name == "nt":
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000153
154#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000155# Some useful functions
156#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000157
def stn(s, length, encoding, errors):
    """Encode the string s into a bytes field of exactly length bytes.

       The encoded text is cut off after length bytes; anything shorter
       is filled up with null bytes on the right.
    """
    encoded = s.encode(encoding, errors)
    padding = b"\0" * (length - len(encoded))
    return encoded[:length] + padding
Thomas Wouters477c8d52006-05-27 19:21:47 +0000163
def nts(s, encoding, errors):
    """Decode a bytes field to a string, dropping the first null byte
       and everything that follows it.
    """
    text_part, _sep, _rest = s.partition(b"\0")
    return text_part.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000171
def nti(s):
    """Decode a header number field into a Python integer.

       Raises InvalidHeaderError if an octal field cannot be parsed.
    """
    # A number field comes in one of two encodings, see itn() below.
    marker = s[0]
    if marker not in (0o200, 0o377):
        # Null-terminated string of octal digits; a blank field counts as 0.
        try:
            digits = nts(s, "ascii", "strict")
            return int(digits.strip() or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")

    # Base-256: the bytes after the marker form a big-endian number.
    payload_len = len(s) - 1
    n = int.from_bytes(s[1:], "big")
    if marker == 0o377:
        # A 0o377 marker flags a negative value.
        n = -(256 ** payload_len - n)
    return n
191
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Encode a Python number as a header number field of digits bytes.

       Raises ValueError if the number does not fit into the field for
       the given format.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    n = int(n)
    width = digits - 1

    if 0 <= n < 8 ** width:
        return bytes("%0*o" % (width, n), "ascii") + NUL

    if format != GNU_FORMAT or not (-256 ** width <= n < 256 ** width):
        raise ValueError("overflow in number field")

    if n >= 0:
        field = bytearray([0o200])
    else:
        field = bytearray([0o377])
        n = 256 ** digits + n

    # Emit the low-order bytes first; inserting at index 1 each time
    # leaves them in big-endian order behind the marker byte.
    for _ in range(width):
        field.insert(1, n & 0o377)
        n >>= 8
    return field
221
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a member's header.

       All bytes of the 512-byte header are summed up except for the
       8-byte chksum field at offset 148, which is counted as if it was
       filled with spaces (8 * 32 == 256).  According to the GNU tar
       sources, some tars (Sun and NeXT) calculate chksum with signed
       char, which gives a different result if there are bytes with the
       high bit set, so both variants are computed.
    """
    layout = "148{0}8x356{0}"
    unsigned_chksum, signed_chksum = (
        256 + sum(struct.unpack_from(layout.format(code), buf))
        for code in ("B", "b")
    )
    return unsigned_chksum, signed_chksum
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000234
def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
    """Copy length bytes from fileobj src to fileobj dst.

       If length is None, the entire remaining content is copied.  The
       data is moved in pieces of bufsize bytes (16 KiB if not given).
       exception is raised if src runs out of data before length bytes
       were copied.
    """
    if not bufsize:
        bufsize = 16 * 1024
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst, bufsize)
        return

    def transfer(count):
        # Move exactly count bytes, treating a short read as an error.
        chunk = src.read(count)
        if len(chunk) < count:
            raise exception("unexpected end of data")
        dst.write(chunk)

    full_chunks, tail = divmod(length, bufsize)
    for _ in range(full_chunks):
        transfer(bufsize)
    if tail != 0:
        transfer(tail)
259
def _safe_print(s):
    """Print s followed by a single space instead of a newline.

       If sys.stdout reports an encoding, characters that cannot be
       encoded with it are replaced by backslash escapes first.
    """
    stream_encoding = getattr(sys.stdout, 'encoding', None)
    if stream_encoding is None:
        text = s
    else:
        raw = s.encode(stream_encoding, 'backslashreplace')
        text = raw.decode(stream_encoding)
    print(text, end=' ')
265
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000266
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000300
301#---------------------------
302# internal stream interface
303#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.
    """

    def __init__(self, name, mode):
        """Open the file name with os.open().

           mode must be "r" (read-only) or "w" (write-only; the file is
           created if necessary and truncated).  Any other mode raises
           KeyError.
        """
        flags = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        # O_BINARY only exists on some platforms; use it where available.
        if hasattr(os, "O_BINARY"):
            flags |= os.O_BINARY
        self.fd = os.open(name, flags, 0o666)

    def close(self):
        """Close the underlying file descriptor."""
        os.close(self.fd)

    def read(self, size):
        """Return up to size bytes read from the file descriptor."""
        return os.read(self.fd, size)

    def write(self, s):
        """Write all of the bytes-like object s to the file descriptor."""
        # os.write() reports how many bytes it actually wrote, which may be
        # fewer than requested; keep going until nothing is left so that a
        # short write never silently drops data.
        view = memoryview(s).cast("B")
        while len(view) > 0:
            written = os.write(self.fd, view)
            view = view[written:]
327
class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip, bzip2 or xz compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.

           name is the file to open when fileobj is None, mode is "r" or
           "w", comptype is one of "tar", "gz", "bz2", "xz" or "*"
           (detect the compression from the data), and bufsize is the
           number of bytes transferred to or from fileobj at a time.
           Raises CompressionError if comptype is unknown or the required
           compression module cannot be imported.
        """
        # Only a file object opened here is closed by this object.
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""
        self.pos = 0
        self.closed = False

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available") from None
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                if mode == "r":
                    self._init_read_gz()
                    self.exception = zlib.error
                else:
                    self._init_write_gz()

            elif comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available") from None
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == "xz":
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available") from None
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype != "tar":
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # Whatever went wrong, do not leak a file opened above; the
            # exception itself is always re-raised.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # "closed" is missing if __init__ failed before setting it.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        # Negative wbits: raw deflate data, the gzip header and trailer
        # are written by this class itself.
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)
        timestamp = struct.pack("<L", int(time.time()))
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # Honor "directory components removed" from RFC1952
        self.name = os.path.basename(self.name)
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        # pos counts uncompressed bytes.
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Write string s to the stream if a whole new block
           is ready to be written.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        self.closed = True
        try:
            if self.mode == "w" and self.comptype != "tar":
                self.buf += self.cmp.flush()

            if self.mode == "w" and self.buf:
                self.fileobj.write(self.buf)
                self.buf = b""
                if self.comptype == "gz":
                    # gzip trailer: CRC-32 and size (modulo 2**32) of the
                    # uncompressed data.
                    self.fileobj.write(struct.pack("<L", self.crc))
                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
        finally:
            if not self._extfileobj:
                self.fileobj.close()

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

           Consumes the gzip header from the raw stream.  Raises
           ReadError if the magic number is wrong and CompressionError
           if the compression method is not deflate.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        # Skip the mtime, extra flags and OS fields.
        self.__read(6)

        if flag & 4:
            # FEXTRA: a 2-byte little-endian length followed by that many
            # bytes.  They belong to the uncompressed header, so they must
            # be skipped with the raw __read(); read() would push them
            # through the decompressor and advance self.pos.
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.__read(xlen)
        if flag & 8:
            # FNAME: null-terminated original file name.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            # FCOMMENT: null-terminated comment.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            # FHCRC: 2-byte header checksum.
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.
        """
        if pos - self.pos >= 0:
            # Seeking forward means reading and discarding the data.
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size):
        """Return the next size number of bytes from the stream."""
        assert size is not None
        buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        t = [self.dbuf]
        while c < size:
            # Skip underlying buffer to avoid unaligned double buffering.
            if self.buf:
                buf = self.buf
                self.buf = b""
            else:
                buf = self.fileobj.read(self.bufsize)
                if not buf:
                    break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception as e:
                raise ReadError("invalid compressed data") from e
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        # Keep whatever was decompressed beyond size for the next call.
        self.dbuf = t[size:]
        return t[:size]

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        t = [self.buf]
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        self.buf = t[size:]
        return t[:size]
# class _Stream
569
class _StreamProxy:
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').

       The first block of the wrapped file object is read up front so
       that getcomptype() can inspect it; the first read() call hands
       that block back.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        # First call only: return the block that was read for detection
        # (size is not looked at) and route all further read() calls
        # directly to the wrapped file object.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Return "gz", "bz2", "xz" or "tar" depending on the signature
           found at the start of the data.
        """
        head = self.buf
        if head.startswith(b"\x1f\x8b\x08"):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        self.fileobj.close()
# class _StreamProxy
596
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000597#------------------------
598# Extraction file object
599#------------------------
class _FileInFile:
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        """fileobj is the underlying file, offset the position in it
           where the data starts and size the size of the resulting
           file.  blockinfo is an optional sequence of (offset, size)
           pairs naming the blocks of the resulting file that are
           actually stored; everything in between reads as null bytes.
        """
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.  Each entry is
        # (is_data, start, stop, position in fileobj or None).
        self.map_index = 0
        self.map = []
        logical_end = 0
        physical = self.offset
        for block_start, block_len in blockinfo:
            if block_start > logical_end:
                self.map.append((False, logical_end, block_start, None))
            block_stop = block_start + block_len
            self.map.append((True, block_start, block_stop, physical))
            physical += block_len
            logical_end = block_stop
        if logical_end < self.size:
            self.map.append((False, logical_end, self.size, None))

    def flush(self):
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.

           The resulting position is clamped to the range 0..size and
           returned.  Raises ValueError for an unknown whence.
        """
        if whence == io.SEEK_SET:
            target = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            if position < 0:
                target = max(self.position + position, 0)
            else:
                target = min(self.position + position, self.size)
        elif whence == io.SEEK_END:
            target = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        self.position = target
        return target

    def read(self, size=None):
        """Read data from the file.

           Returns at most size bytes, or everything up to the end of
           the file if size is None.  Raises ReadError if the underlying
           file object delivers less data than expected.
        """
        available = self.size - self.position
        size = available if size is None else min(size, available)

        pieces = []
        while size > 0:
            # Find the map entry covering the current position, starting
            # at the entry used last and wrapping around at the end.
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                self.map_index = (self.map_index + 1) % len(self.map)
            length = min(size, stop - self.position)
            if data:
                self.fileobj.seek(offset + (self.position - start))
                chunk = self.fileobj.read(length)
                if len(chunk) != length:
                    raise ReadError("unexpected end of data")
                pieces.append(chunk)
            else:
                pieces.append(NUL * length)
            size -= length
            self.position += length
        return b"".join(pieces)

    def readinto(self, b):
        data = self.read(len(b))
        count = len(data)
        b[:count] = data
        return count

    def close(self):
        self.closed = True
#class _FileInFile
Martin v. Löwisdf241532005-03-03 08:17:42 +0000703
class ExFileObject(io.BufferedReader):
    """Buffered reader for the data of a single archive member."""

    def __init__(self, tarfile, tarinfo):
        # Expose only the member's data region (including its sparse
        # layout, if any) of the archive's file object.
        member_data = _FileInFile(
            tarfile.fileobj,
            tarinfo.offset_data,
            tarinfo.size,
            tarinfo.sparse,
        )
        super().__init__(member_data)
#class ExFileObject
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000711
712#------------------
713# Exported Classes
714#------------------
715class TarInfo(object):
716 """Informational class which holds the details about an
717 archive member given by a tar header block.
718 TarInfo objects are returned by TarFile.getmember(),
719 TarFile.getmembers() and TarFile.gettarinfo() and are
720 usually created internally.
721 """
722
# Slot names mapped to their documentation strings. A value of None means
# that no documentation string is provided for that slot.
__slots__ = {
    "name": 'Name of the archive member.',
    "mode": 'Permission bits.',
    "uid": 'User ID of the user who originally stored this member.',
    "gid": 'Group ID of the user who originally stored this member.',
    "size": 'Size in bytes.',
    "mtime": 'Time of last modification.',
    "chksum": 'Header checksum.',
    "type": ('File type. type is usually one of these constants: '
             'REGTYPE, AREGTYPE, LNKTYPE, SYMTYPE, DIRTYPE, FIFOTYPE, '
             'CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_SPARSE.'),
    "linkname": ('Name of the target file name, which is only present '
                 'in TarInfo objects of type LNKTYPE and SYMTYPE.'),
    "uname": 'User name.',
    "gname": 'Group name.',
    "devmajor": 'Device major number.',
    "devminor": 'Device minor number.',
    "offset": 'The tar header starts here.',
    "offset_data": "The file's data starts here.",
    "pax_headers": ('A dictionary containing key-value pairs of an '
                    'associated pax extended header.'),
    "sparse": 'Sparse member information.',
    "tarfile": None,
    "_sparse_structs": None,
    "_link_target": None,
}
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000749
def __init__(self, name=""):
    """Create a TarInfo object with default field values.

    name is the optional name of the member. All other header fields
    start out as zero / empty, with mode 0o644 and type REGTYPE.
    """
    # Identity of the member.
    self.name = name
    self.type = REGTYPE
    self.linkname = ""

    # Ownership and permissions.
    self.mode = 0o644
    self.uid = 0
    self.gid = 0
    self.uname = ""
    self.gname = ""

    # Size, timestamp and checksum.
    self.size = 0
    self.mtime = 0
    self.chksum = 0

    # Device numbers.
    self.devmajor = 0
    self.devminor = 0

    # Positions inside the archive: start of the header and of the data.
    self.offset = 0
    self.offset_data = 0

    # Sparse member information and pax header information.
    self.sparse = None
    self.pax_headers = {}
773
@property
def path(self):
    """Alias for "name"; pax headers call this field "path"."""
    return self.name

@path.setter
def path(self, name):
    # Writing the alias stores the value in the "name" attribute.
    self.name = name
782
@property
def linkpath(self):
    """Alias for "linkname"; pax headers call this field "linkpath"."""
    return self.linkname

@linkpath.setter
def linkpath(self, linkname):
    # Writing the alias stores the value in the "linkname" attribute.
    self.linkname = linkname
Guido van Rossumd8faa362007-04-27 19:54:29 +0000791
def __repr__(self):
    """Return a string showing the class name, the member name and id(self)."""
    details = (self.__class__.__name__, self.name, id(self))
    return "<%s %r at %#x>" % details
794
def get_info(self):
    """Return the TarInfo's attributes as a dictionary.

    The mode is masked with 0o7777, and the name of a DIRTYPE member
    gets a trailing "/" if it does not already end with one.
    """
    info = dict(
        name=self.name,
        mode=self.mode & 0o7777,
        uid=self.uid,
        gid=self.gid,
        size=self.size,
        mtime=self.mtime,
        chksum=self.chksum,
        type=self.type,
        linkname=self.linkname,
        uname=self.uname,
        gname=self.gname,
        devmajor=self.devmajor,
        devminor=self.devminor,
    )

    is_directory = info["type"] == DIRTYPE
    if is_directory and not info["name"].endswith("/"):
        info["name"] += "/"

    return info
818
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
    """Return a tar header as a string of 512 byte blocks.

    format selects the header flavour (USTAR_FORMAT, GNU_FORMAT or
    PAX_FORMAT); any other value raises ValueError.
    """
    info = self.get_info()

    if format == USTAR_FORMAT:
        return self.create_ustar_header(info, encoding, errors)
    if format == GNU_FORMAT:
        return self.create_gnu_header(info, encoding, errors)
    if format == PAX_FORMAT:
        # The pax variant takes no error handler argument.
        return self.create_pax_header(info, encoding)
    raise ValueError("invalid format")
832
def create_ustar_header(self, info, encoding, errors):
    """Return the object as a ustar header block.

    Raises ValueError if the encoded linkname exceeds LENGTH_LINK. An
    encoded name longer than LENGTH_NAME is split into prefix and name.
    """
    info["magic"] = POSIX_MAGIC

    encoded_linkname = info["linkname"].encode(encoding, errors)
    if len(encoded_linkname) > LENGTH_LINK:
        raise ValueError("linkname is too long")

    encoded_name = info["name"].encode(encoding, errors)
    if len(encoded_name) > LENGTH_NAME:
        prefix, name = self._posix_split_name(info["name"], encoding, errors)
        info["prefix"] = prefix
        info["name"] = name

    return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000845
def create_gnu_header(self, info, encoding, errors):
    """Return the object as a GNU header block sequence.

    A long-link and/or long-name sequence is emitted first whenever the
    encoded linkname or name exceeds its field length.
    """
    info["magic"] = GNU_MAGIC

    long_fields = (
        ("linkname", LENGTH_LINK, GNUTYPE_LONGLINK),
        ("name", LENGTH_NAME, GNUTYPE_LONGNAME),
    )
    chunks = []
    for key, limit, long_type in long_fields:
        if len(info[key].encode(encoding, errors)) > limit:
            chunks.append(
                self._create_gnu_long_header(info[key], long_type, encoding, errors))

    chunks.append(self._create_header(info, GNU_FORMAT, encoding, errors))
    return b"".join(chunks)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000859
def create_pax_header(self, info, encoding):
    """Return the object as a ustar header block. If it cannot be
       represented this way, prepend a pax extended header sequence
       with supplement information.

       info is the dictionary produced by get_info(); it is modified in
       place. encoding is passed on to the pax record builder. Entries
       already present in self.pax_headers take priority over values
       derived from info.
    """
    info["magic"] = POSIX_MAGIC
    pax_headers = self.pax_headers.copy()

    # Test string fields for values that exceed the field length or cannot
    # be represented in ASCII encoding.
    for name, hname, length in (
            ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
            ("uname", "uname", 32), ("gname", "gname", 32)):

        if hname in pax_headers:
            # The pax header has priority.
            continue

        # Try to encode the string as ASCII.
        try:
            info[name].encode("ascii", "strict")
        except UnicodeEncodeError:
            pax_headers[hname] = info[name]
            continue

        if len(info[name]) > length:
            pax_headers[hname] = info[name]

    # Test number fields for values that exceed the field limit or values
    # that like to be stored as float.
    for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
        if name in pax_headers:
            # The pax header has priority. Avoid overflow.
            info[name] = 0
            continue

        val = info[name]
        limit = 8 ** (digits - 1)
        in_range = 0 <= val < limit

        if isinstance(val, float):
            # Keep the full precision in the pax record, but still store
            # the rounded value in the ustar field so that readers which
            # ignore pax records do not see 0. Values that are out of
            # range (including NaN and infinities, for which the range
            # test is False) or that round up to the limit fall back to 0.
            pax_headers[name] = str(val)
            rounded = round(val) if in_range else 0
            info[name] = rounded if rounded < limit else 0
        elif not in_range:
            # Integer that does not fit the octal field: pax record only.
            pax_headers[name] = str(val)
            info[name] = 0

    # Create a pax extended header if necessary.
    if pax_headers:
        buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
    else:
        buf = b""

    return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +0000908
@classmethod
def create_pax_global_header(cls, pax_headers):
    """Return the object as a pax global header block sequence.

    The records are built from pax_headers with type XGLTYPE and the
    "utf-8" encoding.
    """
    build = cls._create_pax_generic_header
    return build(pax_headers, XGLTYPE, "utf-8")
Guido van Rossumd8faa362007-04-27 19:54:29 +0000914
def _posix_split_name(self, name, encoding, errors):
    """Split a name that is too long for the name field into a
       prefix and a name part.

       The split happens at a "/" boundary; the first split point whose
       encoded prefix fits LENGTH_PREFIX and whose encoded remainder fits
       LENGTH_NAME is used. Raises ValueError if there is none.
    """
    parts = name.split("/")
    for cut in range(1, len(parts)):
        head = "/".join(parts[:cut])
        tail = "/".join(parts[cut:])
        if len(head.encode(encoding, errors)) > LENGTH_PREFIX:
            continue
        if len(tail.encode(encoding, errors)) > LENGTH_NAME:
            continue
        return head, tail

    raise ValueError("name is too long")
930
@staticmethod
def _create_header(info, format, encoding, errors):
    """Return a header block. info is a dictionary with file
       information, format must be one of the *_FORMAT constants.

       String fields are encoded with stn() using encoding and errors,
       number fields with itn() using format. Missing keys fall back to
       empty / zero values, REGTYPE and POSIX_MAGIC.
    """
    # Device numbers are only written as numbers for character and block
    # devices; for every other type the two fields are filled via stn("").
    has_device_fields = info.get("type") in (CHRTYPE, BLKTYPE)
    if has_device_fields:
        devmajor = itn(info.get("devmajor", 0), 8, format)
        devminor = itn(info.get("devminor", 0), 8, format)
    else:
        devmajor = stn("", 8, encoding, errors)
        devminor = stn("", 8, encoding, errors)

    parts = [
        stn(info.get("name", ""), 100, encoding, errors),
        itn(info.get("mode", 0) & 0o7777, 8, format),
        itn(info.get("uid", 0), 8, format),
        itn(info.get("gid", 0), 8, format),
        itn(info.get("size", 0), 12, format),
        itn(info.get("mtime", 0), 12, format),
        # Checksum field placeholder. It has to be exactly 8 bytes so that
        # the type flag below lands at offset 156, where frombuf() reads it
        # (the checksum itself is read from buf[148:156]).
        b" " * 8,
        info.get("type", REGTYPE),
        stn(info.get("linkname", ""), 100, encoding, errors),
        info.get("magic", POSIX_MAGIC),
        stn(info.get("uname", ""), 32, encoding, errors),
        stn(info.get("gname", ""), 32, encoding, errors),
        devmajor,
        devminor,
        stn(info.get("prefix", ""), 155, encoding, errors)
    ]

    # Pad the joined fields with NUL bytes up to BLOCKSIZE.
    buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
    # The checksum is computed while the placeholder still holds spaces.
    chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
    # Overwrite the first 7 placeholder bytes (offset 148 in a 512 byte
    # block) with six octal digits and a NUL; the eighth stays a space.
    buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
    return buf
966
@staticmethod
def _create_payload(payload):
    """Return the string payload filled with zero bytes
       up to the next 512 byte border.

       A payload whose length is already a multiple of BLOCKSIZE is
       returned unchanged.
    """
    remainder = len(payload) % BLOCKSIZE
    if remainder > 0:
        padding = (BLOCKSIZE - remainder) * NUL
        payload += padding
    return payload
976
@classmethod
def _create_gnu_long_header(cls, name, type, encoding, errors):
    """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
       for name.

       The sequence is a header block named "././@LongLink" followed by
       the encoded, NUL-terminated name padded by _create_payload().
    """
    data = name.encode(encoding, errors) + NUL

    info = {
        "name": "././@LongLink",
        "type": type,
        "size": len(data),
        "magic": GNU_MAGIC,
    }

    # Extended header first, then the name blocks.
    header = cls._create_header(info, USTAR_FORMAT, encoding, errors)
    return header + cls._create_payload(data)
993
@classmethod
def _create_pax_generic_header(cls, pax_headers, type, encoding):
    """Return a POSIX.1-2008 extended or global header sequence
       that contains a list of keyword, value pairs. The values
       must be strings.
    """
    # If any value cannot be encoded as strict UTF-8 (for instance because
    # it contains surrogate characters), the whole header is written with
    # hdrcharset=BINARY; see _proc_pax() for more information.
    binary = False
    for value in pax_headers.values():
        try:
            value.encode("utf-8", "strict")
        except UnicodeEncodeError:
            binary = True
            break

    chunks = []
    if binary:
        # The hdrcharset field goes at the beginning of the header.
        chunks.append(b"21 hdrcharset=BINARY\n")

    for keyword, value in pax_headers.items():
        raw_keyword = keyword.encode("utf-8")
        if binary:
            # Try to restore the original byte representation of `value'.
            # Needless to say, that the encoding must match the string.
            raw_value = value.encode(encoding, "surrogateescape")
        else:
            raw_value = value.encode("utf-8")

        # The record length counts its own decimal digits, so iterate
        # until the total no longer changes.
        base = len(raw_keyword) + len(raw_value) + 3   # ' ' + '=' + '\n'
        total = 0
        while True:
            candidate = base + len(str(total))
            if candidate == total:
                break
            total = candidate
        chunks.append(
            bytes(str(total), "ascii") + b" " + raw_keyword + b"=" + raw_value + b"\n")

    records = b"".join(chunks)

    # We use a hardcoded "././@PaxHeader" name like star does
    # instead of the one that POSIX recommends.
    info = {
        "name": "././@PaxHeader",
        "type": type,
        "size": len(records),
        "magic": POSIX_MAGIC,
    }

    # Pax header block first, then the record blocks.
    header = cls._create_header(info, USTAR_FORMAT, "ascii", "replace")
    return header + cls._create_payload(records)
1044
@classmethod
def frombuf(cls, buf, encoding, errors):
    """Construct a TarInfo object from a 512 byte bytes object.

       Raises EmptyHeaderError for an empty buffer, TruncatedHeaderError
       for a buffer whose length is not BLOCKSIZE, EOFHeaderError for a
       block consisting only of NUL bytes and InvalidHeaderError if the
       stored checksum is not among the values from calc_chksums().
    """
    size = len(buf)
    if size == 0:
        raise EmptyHeaderError("empty header")
    if size != BLOCKSIZE:
        raise TruncatedHeaderError("truncated header")
    if buf.count(NUL) == BLOCKSIZE:
        raise EOFHeaderError("end of file header")

    chksum = nti(buf[148:156])
    if chksum not in calc_chksums(buf):
        raise InvalidHeaderError("bad checksum")

    def text(field):
        # Decode a string field with the caller's encoding and errors.
        return nts(field, encoding, errors)

    obj = cls()
    obj.name = text(buf[0:100])
    obj.mode = nti(buf[100:108])
    obj.uid = nti(buf[108:116])
    obj.gid = nti(buf[116:124])
    obj.size = nti(buf[124:136])
    obj.mtime = nti(buf[136:148])
    obj.chksum = chksum
    obj.type = buf[156:157]
    obj.linkname = text(buf[157:257])
    obj.uname = text(buf[265:297])
    obj.gname = text(buf[297:329])
    obj.devmajor = nti(buf[329:337])
    obj.devminor = nti(buf[337:345])
    prefix = text(buf[345:500])

    # Old V7 tar format represents a directory as a regular
    # file with a trailing slash.
    if obj.type == AREGTYPE and obj.name.endswith("/"):
        obj.type = DIRTYPE

    # The old GNU sparse format occupies some of the unused
    # space in the buffer for up to 4 sparse structures.
    # Save them for later processing in _proc_sparse().
    if obj.type == GNUTYPE_SPARSE:
        structs = []
        # Four 24 byte entries starting at offset 386.
        for pos in range(386, 386 + 4 * 24, 24):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            structs.append((offset, numbytes))
        isextended = bool(buf[482])
        origsize = nti(buf[483:495])
        obj._sparse_structs = (structs, isextended, origsize)

    # Remove redundant slashes from directories.
    if obj.isdir():
        obj.name = obj.name.rstrip("/")

    # Reconstruct a ustar longname.
    if prefix and obj.type not in GNU_TYPES:
        obj.name = prefix + "/" + obj.name
    return obj
1107
@classmethod
def fromtarfile(cls, tarfile):
    """Return the next TarInfo object from TarFile object
       tarfile.

       One block is read from tarfile.fileobj and parsed with frombuf();
       the result of the object's _proc_member() is returned.
    """
    fileobj = tarfile.fileobj
    header = fileobj.read(BLOCKSIZE)
    member = cls.frombuf(header, tarfile.encoding, tarfile.errors)
    # The header started one block before the current file position.
    member.offset = fileobj.tell() - BLOCKSIZE
    return member._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001117
Guido van Rossumd8faa362007-04-27 19:54:29 +00001118 #--------------------------------------------------------------------------
1119 # The following are methods that are called depending on the type of a
1120 # member. The entry point is _proc_member() which can be overridden in a
1121 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1122 # implement the following
1123 # operations:
1124 # 1. Set self.offset_data to the position where the data blocks begin,
1125 # if there is data that follows.
1126 # 2. Set tarfile.offset to the position where the next member's header will
1127 # begin.
1128 # 3. Return self or another valid TarInfo object.
def _proc_member(self, tarfile):
    """Choose the right processing method depending on
       the type and call it.
    """
    kind = self.type
    if kind in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
        handler = self._proc_gnulong
    elif kind == GNUTYPE_SPARSE:
        handler = self._proc_sparse
    elif kind in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
        handler = self._proc_pax
    else:
        handler = self._proc_builtin
    return handler(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001141
def _proc_builtin(self, tarfile):
    """Process a builtin type or an unknown type which
       will be treated as a regular file.
    """
    data_start = tarfile.fileobj.tell()
    self.offset_data = data_start

    next_header = data_start
    if self.isreg() or self.type not in SUPPORTED_TYPES:
        # Skip the following data blocks.
        next_header += self._block(self.size)
    tarfile.offset = next_header

    # Patch the TarInfo object with saved global
    # header information.
    self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

    return self
1158
def _proc_gnulong(self, tarfile):
    """Process the blocks that hold a GNU longname
       or longlink member.

       Returns the TarInfo of the header that follows, patched with the
       long name or link name. A HeaderError while reading that header
       is re-raised as SubsequentHeaderError.
    """
    buf = tarfile.fileobj.read(self._block(self.size))

    # Fetch the next header and process it.
    try:
        following = self.fromtarfile(tarfile)
    except HeaderError as e:
        raise SubsequentHeaderError(str(e)) from None

    # Patch the TarInfo object from the next header with
    # the longname information.
    following.offset = self.offset
    if self.type == GNUTYPE_LONGNAME:
        following.name = nts(buf, tarfile.encoding, tarfile.errors)
    elif self.type == GNUTYPE_LONGLINK:
        following.linkname = nts(buf, tarfile.encoding, tarfile.errors)

    return following
1180
def _proc_sparse(self, tarfile):
    """Process a GNU sparse header plus extra headers.

       Sets self.sparse, self.offset_data and tarfile.offset, replaces
       self.size with the original size recorded in the header and
       returns self. Raises TruncatedHeaderError if an extension block
       cannot be read completely.
    """
    # We already collected some sparse structures in frombuf().
    structs, isextended, origsize = self._sparse_structs
    del self._sparse_structs

    # Collect sparse structures from extended header blocks.
    while isextended:
        buf = tarfile.fileobj.read(BLOCKSIZE)
        if len(buf) != BLOCKSIZE:
            # A short read would otherwise surface as an IndexError from
            # buf[504] below; report it like frombuf() reports a short
            # header block.
            raise TruncatedHeaderError("truncated header")
        pos = 0
        for i in range(21):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            if offset and numbytes:
                structs.append((offset, numbytes))
            pos += 24
        # Byte 504 tells whether yet another extension block follows.
        isextended = bool(buf[504])
    self.sparse = structs

    self.offset_data = tarfile.fileobj.tell()
    # The next header position is computed from the stored size, before
    # self.size is replaced by the original size.
    tarfile.offset = self.offset_data + self._block(self.size)
    self.size = origsize
    return self
1208
def _proc_pax(self, tarfile):
    """Process an extended or global header as described in
       POSIX.1-2008.

       Reads the records that follow this header, stores them in a
       pax header dictionary, then reads the next header from tarfile
       and returns its TarInfo object. For an extended header the
       records are applied to that object. Raises InvalidHeaderError for
       a record with length 0 and SubsequentHeaderError if the next
       header cannot be read.
    """
    # Read the header information.
    buf = tarfile.fileobj.read(self._block(self.size))

    # A pax header stores supplemental information for either
    # the following file (extended) or all following files
    # (global).
    if self.type == XGLTYPE:
        # Global header: update the TarFile's own dictionary in place.
        pax_headers = tarfile.pax_headers
    else:
        # Otherwise work on a copy so tarfile.pax_headers is untouched.
        pax_headers = tarfile.pax_headers.copy()

    # Check if the pax header contains a hdrcharset field. This tells us
    # the encoding of the path, linkpath, uname and gname fields. Normally,
    # these fields are UTF-8 encoded but since POSIX.1-2008 tar
    # implementations are allowed to store them as raw binary strings if
    # the translation to UTF-8 fails.
    match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
    if match is not None:
        pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

    # For the time being, we don't care about anything other than "BINARY".
    # The only other value that is currently allowed by the standard is
    # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
    hdrcharset = pax_headers.get("hdrcharset")
    if hdrcharset == "BINARY":
        encoding = tarfile.encoding
    else:
        encoding = "utf-8"

    # Parse pax header information. A record looks like that:
    # "%d %s=%s\n" % (length, keyword, value). length is the size
    # of the complete record including the length field itself and
    # the newline. keyword and value are both UTF-8 encoded strings.
    regex = re.compile(br"(\d+) ([^=]+)=")
    pos = 0
    while True:
        match = regex.match(buf, pos)
        if not match:
            break

        length, keyword = match.groups()
        length = int(length)
        if length == 0:
            # A zero length would leave pos unchanged below, so the same
            # record would be matched again forever.
            raise InvalidHeaderError("invalid header")
        # The value runs from just after the "=" to just before the last
        # byte of the record.
        value = buf[match.end(2) + 1:match.start(1) + length - 1]

        # Normally, we could just use "utf-8" as the encoding and "strict"
        # as the error handler, but we better not take the risk. For
        # example, GNU tar <= 1.23 is known to store filenames it cannot
        # translate to UTF-8 as raw strings (unfortunately without a
        # hdrcharset=BINARY header).
        # We first try the strict standard encoding, and if that fails we
        # fall back on the user's encoding and error handler.
        keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                tarfile.errors)
        if keyword in PAX_NAME_FIELDS:
            value = self._decode_pax_field(value, encoding, tarfile.encoding,
                    tarfile.errors)
        else:
            value = self._decode_pax_field(value, "utf-8", "utf-8",
                    tarfile.errors)

        pax_headers[keyword] = value
        pos += length

    # Fetch the next header.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError as e:
        raise SubsequentHeaderError(str(e)) from None

    # Process GNU sparse information.
    if "GNU.sparse.map" in pax_headers:
        # GNU extended sparse format version 0.1.
        self._proc_gnusparse_01(next, pax_headers)

    elif "GNU.sparse.size" in pax_headers:
        # GNU extended sparse format version 0.0.
        self._proc_gnusparse_00(next, pax_headers, buf)

    elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
        # GNU extended sparse format version 1.0.
        self._proc_gnusparse_10(next, pax_headers, tarfile)

    if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
        # Patch the TarInfo object with the extended header info.
        next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
        # The member is reported as starting at this pax header.
        next.offset = self.offset

        if "size" in pax_headers:
            # If the extended header replaces the size field,
            # we need to recalculate the offset where the next
            # header starts.
            offset = next.offset_data
            if next.isreg() or next.type not in SUPPORTED_TYPES:
                offset += next._block(next.size)
            tarfile.offset = offset

    return next
1312
def _proc_gnusparse_00(self, next, pax_headers, buf):
    """Process a GNU tar extended sparse header, version 0.0.

       buf holds the raw pax records. Every GNU.sparse.offset and
       GNU.sparse.numbytes record found in it is collected, and the
       values are paired up in order and stored as a list of
       (offset, numbytes) tuples in next.sparse. pax_headers is not used.
    """
    # The dots are escaped so that only the literal keywords match; an
    # unescaped "." would accept any byte in those positions.
    offsets = [int(match.group(1))
               for match in re.finditer(br"\d+ GNU\.sparse\.offset=(\d+)\n", buf)]
    numbytes = [int(match.group(1))
                for match in re.finditer(br"\d+ GNU\.sparse\.numbytes=(\d+)\n", buf)]
    next.sparse = list(zip(offsets, numbytes))
1323
def _proc_gnusparse_01(self, next, pax_headers):
    """Process a GNU tar extended sparse header, version 0.1.

       The "GNU.sparse.map" record is a comma separated list of integers;
       consecutive pairs are stored as tuples in next.sparse.
    """
    numbers = list(map(int, pax_headers["GNU.sparse.map"].split(",")))
    first_of_pair = numbers[::2]
    second_of_pair = numbers[1::2]
    next.sparse = list(zip(first_of_pair, second_of_pair))
1329
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

       The sparse map is read from tarfile.fileobj as newline terminated
       decimal numbers: first the number of entries, then two numbers per
       entry. Sets next.offset_data to the file position after reading
       and next.sparse to the list of number pairs.
    """
    fileobj = tarfile.fileobj
    buf = fileobj.read(BLOCKSIZE)
    count, buf = buf.split(b"\n", 1)
    wanted = int(count) * 2

    numbers = []
    while len(numbers) < wanted:
        if b"\n" not in buf:
            # No complete number left in the buffer; read another block.
            buf += fileobj.read(BLOCKSIZE)
        number, buf = buf.split(b"\n", 1)
        numbers.append(int(number))

    next.offset_data = fileobj.tell()
    next.sparse = list(zip(numbers[::2], numbers[1::2]))
1345
def _apply_pax_info(self, pax_headers, encoding, errors):
    """Replace fields with supplemental information from a previous
       pax extended or global header.

       A copy of pax_headers is stored in self.pax_headers afterwards.
    """
    for keyword, value in pax_headers.items():
        if keyword == "GNU.sparse.name":
            self.path = value
        elif keyword in ("GNU.sparse.size", "GNU.sparse.realsize"):
            self.size = int(value)
        elif keyword in PAX_FIELDS:
            if keyword in PAX_NUMBER_FIELDS:
                # A number field that cannot be converted becomes 0.
                try:
                    value = PAX_NUMBER_FIELDS[keyword](value)
                except ValueError:
                    value = 0
            if keyword == "path":
                value = value.rstrip("/")
            setattr(self, keyword, value)

    self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001368
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001369 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1370 """Decode a single field from a pax record.
1371 """
1372 try:
1373 return value.decode(encoding, "strict")
1374 except UnicodeDecodeError:
1375 return value.decode(fallback_encoding, fallback_errors)
1376
def _block(self, count):
    """Return `count' rounded up to the next multiple of BLOCKSIZE,
    e.g. _block(834) => 1024.
    """
    whole, leftover = divmod(count, BLOCKSIZE)
    # A partially filled block still occupies a full block.
    return (whole + 1 if leftover else whole) * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001385
def isreg(self):
    """Return True if the TarInfo object is a regular file, i.e. its
    type is one of REGULAR_TYPES.
    """
    return self.type in REGULAR_TYPES
Raymond Hettingera694f232019-03-27 13:16:34 -07001389
def isfile(self):
    """Return True if the TarInfo object is a regular file.

    Delegates to isreg().
    """
    return self.isreg()
Raymond Hettingera694f232019-03-27 13:16:34 -07001393
def isdir(self):
    """Return True if it is a directory (type equals DIRTYPE)."""
    return self.type == DIRTYPE
Raymond Hettingera694f232019-03-27 13:16:34 -07001397
def issym(self):
    """Return True if it is a symbolic link (type equals SYMTYPE)."""
    return self.type == SYMTYPE
Raymond Hettingera694f232019-03-27 13:16:34 -07001401
def islnk(self):
    """Return True if it is a hard link (type equals LNKTYPE)."""
    return self.type == LNKTYPE
Raymond Hettingera694f232019-03-27 13:16:34 -07001405
def ischr(self):
    """Return True if it is a character device (type equals CHRTYPE)."""
    return self.type == CHRTYPE
Raymond Hettingera694f232019-03-27 13:16:34 -07001409
def isblk(self):
    """Return True if it is a block device (type equals BLKTYPE)."""
    return self.type == BLKTYPE
Raymond Hettingera694f232019-03-27 13:16:34 -07001413
def isfifo(self):
    """Return True if it is a FIFO (type equals FIFOTYPE)."""
    return self.type == FIFOTYPE
Raymond Hettingera694f232019-03-27 13:16:34 -07001417
def issparse(self):
    """Return True if a sparse map has been recorded for this member,
    i.e. its `sparse' attribute is not None.
    """
    sparse_map = self.sparse
    return sparse_map is not None
Raymond Hettingera694f232019-03-27 13:16:34 -07001420
def isdev(self):
    """Return True if it is one of character device, block device or
    FIFO.
    """
    device_types = (CHRTYPE, BLKTYPE, FIFOTYPE)
    return self.type in device_types
1424# class TarInfo
1425
1426class TarFile(object):
1427 """The TarFile Class provides an interface to tar archives.
1428 """
1429
1430 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs)
1431
1432 dereference = False # If true, add content of linked file to the
1433 # tar file, else the link.
1434
1435 ignore_zeros = False # If true, skips empty or invalid blocks and
1436 # continues processing.
1437
Lars Gustäbel365aff32009-12-13 11:42:29 +00001438 errorlevel = 1 # If 0, fatal errors only appear in debug
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001439 # messages (if debug >= 0). If > 0, errors
1440 # are passed to the caller as exceptions.
1441
Guido van Rossumd8faa362007-04-27 19:54:29 +00001442 format = DEFAULT_FORMAT # The format to use when creating an archive.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001443
Guido van Rossume7ba4952007-06-06 23:52:48 +00001444 encoding = ENCODING # Encoding for 8-bit character strings.
1445
1446 errors = None # Error handler for unicode conversion.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001447
Guido van Rossumd8faa362007-04-27 19:54:29 +00001448 tarinfo = TarInfo # The default TarInfo class to use.
1449
Lars Gustäbelb062a2f2012-05-14 13:18:16 +02001450 fileobject = ExFileObject # The file-object for extractfile().
Guido van Rossumd8faa362007-04-27 19:54:29 +00001451
def __init__(self, name=None, mode="r", fileobj=None, format=None,
        tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
        errors="surrogateescape", pax_headers=None, debug=None,
        errorlevel=None, copybufsize=None):
    """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
       read from an existing archive, 'a' to append data to an existing
       file, 'w' to create a new file overwriting an existing one, or 'x'
       to create a new file exclusively. `mode' defaults to 'r'.
       If `fileobj' is given, it is used for reading or writing data. If it
       can be determined, `mode' is overridden by `fileobj's mode.
       `fileobj' is not closed, when TarFile is closed.

       `format', `tarinfo', `dereference', `ignore_zeros', `encoding',
       `debug' and `errorlevel' replace the class-level defaults only
       when they are not None. `errors' is always stored. `pax_headers'
       is only kept when the resulting format is PAX_FORMAT.

       Raises ValueError for an unknown `mode'. In append mode a
       HeaderError met while scanning for the end of the archive is
       re-raised as ReadError. If anything fails after the file has been
       opened, a file opened here is closed again before the exception
       propagates.
    """
    # Map the public one-letter mode to the mode used to open the file.
    modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
    if mode not in modes:
        raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
    self.mode = mode
    self._mode = modes[mode]

    if not fileobj:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        # We opened the file ourselves, so close() must close it.
        self._extfileobj = False
    else:
        # Borrow the name from the file object when none was given and
        # the object exposes a usable (str or bytes) one.
        if (name is None and hasattr(fileobj, "name") and
            isinstance(fileobj.name, (str, bytes))):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        # The caller owns the file object; close() leaves it open.
        self._extfileobj = True
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Init attributes.
    if format is not None:
        self.format = format
    if tarinfo is not None:
        self.tarinfo = tarinfo
    if dereference is not None:
        self.dereference = dereference
    if ignore_zeros is not None:
        self.ignore_zeros = ignore_zeros
    if encoding is not None:
        self.encoding = encoding
    self.errors = errors

    if pax_headers is not None and self.format == PAX_FORMAT:
        self.pax_headers = pax_headers
    else:
        self.pax_headers = {}

    if debug is not None:
        self.debug = debug
    if errorlevel is not None:
        self.errorlevel = errorlevel

    # Init datastructures.
    self.copybufsize = copybufsize
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            # Set the attribute before calling next(), then replace it
            # with the first member that next() returns.
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    tarinfo = self.tarinfo.fromtarfile(self)
                    self.members.append(tarinfo)
                except EOFHeaderError:
                    # Rewind to the start of the header that signalled
                    # the end, so new members are written from there.
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e)) from None

        if self.mode in ("a", "w", "x"):
            # Nothing further will be read from the file in a writing
            # mode, so the member list counts as complete.
            self._loaded = True

            if self.pax_headers:
                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    except:
        # Deliberately bare: whatever went wrong, do not leak a file we
        # opened ourselves, then let the exception propagate unchanged.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001551
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001552 #--------------------------------------------------------------------------
1553 # Below are the classmethods which act as alternate constructors to the
1554 # TarFile class. The open() method is the only one that is needed for
1555 # public use; it is the "super"-constructor and is able to select an
1556 # adequate "sub"-constructor for a particular compression using the mapping
1557 # from OPEN_METH.
1558 #
1559 # This concept allows one to subclass TarFile without losing the comfort of
1560 # the super-constructor. A sub-constructor is registered and made available
1561 # by adding it to the mapping in OPEN_METH.
1562
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
       an appropriate TarFile class.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'r:xz'       open for reading with lzma compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression
       'w:xz'       open for writing with lzma compression

       'x' or 'x:'  create a tarfile exclusively without compression, raise
                    an exception if the file is already created
       'x:gz'       create a gzip compressed tarfile, raise an exception
                    if the file is already created
       'x:bz2'      create a bzip2 compressed tarfile, raise an exception
                    if the file is already created
       'x:xz'       create an lzma compressed tarfile, raise an exception
                    if the file is already created

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'r|xz'       open an lzma compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing
       'w|xz'       open an lzma compressed stream for writing

       Extra keyword arguments are passed on to the selected opener.

       Raises ValueError if neither `name' nor `fileobj' is given or the
       mode cannot be interpreted, CompressionError for an unknown
       compression type, and ReadError if no registered method can read
       the archive in 'r'/'r:*' mode. In that last case the message
       lists the reason each method failed.
    """

    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Find out which *open() is appropriate for opening the file.
        # Compressed formats are tried first; the sort is stable, so
        # plain tar simply moves to the end.
        def not_compressed(comptype):
            return cls.OPEN_METH[comptype] == 'taropen'
        # Keep the reason each candidate was rejected, so that the
        # final error tells the caller why nothing worked.
        error_msgs = []
        for comptype in sorted(cls.OPEN_METH, key=not_compressed):
            func = getattr(cls, cls.OPEN_METH[comptype])
            if fileobj is not None:
                saved_pos = fileobj.tell()
            try:
                return func(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError) as e:
                error_msgs.append("- method %s: %r" % (comptype, e))
                if fileobj is not None:
                    # Rewind so the next candidate sees the same data.
                    fileobj.seek(saved_pos)
                continue
        raise ReadError("file could not be opened successfully:\n%s"
                        % "\n".join(error_msgs))

    elif ":" in mode:
        filemode, comptype = mode.split(":", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
        else:
            raise CompressionError("unknown compression type %r" % comptype)
        return func(name, filemode, fileobj, **kwargs)

    elif "|" in mode:
        filemode, comptype = mode.split("|", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            t = cls(name, filemode, stream, **kwargs)
        except:
            # Do not leak the stream if the archive cannot be set up.
            stream.close()
            raise
        # The stream was created here, so closing the TarFile closes it.
        t._extfileobj = False
        return t

    elif mode in ("a", "w", "x"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001653
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.

    Raises ValueError unless `mode' is one of 'r', 'a', 'w' or 'x';
    otherwise the class is instantiated with the given arguments.
    """
    if mode in ("r", "a", "w", "x"):
        return cls(name, mode, fileobj, **kwargs)
    raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001661
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError for a mode other than 'r', 'w' or 'x',
       CompressionError if the gzip module cannot be imported, and
       ReadError if an OSError occurs while reading in mode 'r'.
    """
    if mode not in ("r", "w", "x"):
        raise ValueError("mode must be 'r', 'w' or 'x'")

    try:
        from gzip import GzipFile
    except ImportError:
        raise CompressionError("gzip module is not available") from None

    try:
        gz_stream = GzipFile(name, mode + "b", compresslevel, fileobj)
    except OSError as exc:
        if fileobj is not None and mode == 'r':
            raise ReadError("not a gzip file") from exc
        raise

    try:
        archive = cls.taropen(name, mode, gz_stream, **kwargs)
    except BaseException as exc:
        # Whatever failed, the wrapper created above must not leak.
        gz_stream.close()
        if isinstance(exc, OSError) and mode == 'r':
            raise ReadError("not a gzip file") from exc
        raise
    # The GzipFile was created here, so the archive has to close it.
    archive._extfileobj = False
    return archive
1694
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError for a mode other than 'r', 'w' or 'x',
       CompressionError if the bz2 module cannot be imported, and
       ReadError if an OSError or EOFError occurs while reading in
       mode 'r'.
    """
    if mode not in ("r", "w", "x"):
        raise ValueError("mode must be 'r', 'w' or 'x'")

    try:
        from bz2 import BZ2File
    except ImportError:
        raise CompressionError("bz2 module is not available") from None

    bz_stream = BZ2File(fileobj or name, mode, compresslevel=compresslevel)

    try:
        archive = cls.taropen(name, mode, bz_stream, **kwargs)
    except BaseException as exc:
        # Whatever failed, the wrapper created above must not leak.
        bz_stream.close()
        if isinstance(exc, (OSError, EOFError)) and mode == 'r':
            raise ReadError("not a bzip2 file") from exc
        raise
    # The BZ2File was created here, so the archive has to close it.
    archive._extfileobj = False
    return archive
1722
@classmethod
def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
    """Open lzma compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError for a mode other than 'r', 'w' or 'x',
       CompressionError if the lzma module cannot be imported, and
       ReadError if an LZMAError or EOFError occurs while reading in
       mode 'r'.
    """
    if mode not in ("r", "w", "x"):
        raise ValueError("mode must be 'r', 'w' or 'x'")

    try:
        from lzma import LZMAFile, LZMAError
    except ImportError:
        raise CompressionError("lzma module is not available") from None

    xz_stream = LZMAFile(fileobj or name, mode, preset=preset)

    try:
        archive = cls.taropen(name, mode, xz_stream, **kwargs)
    except BaseException as exc:
        # Whatever failed, the wrapper created above must not leak.
        xz_stream.close()
        if isinstance(exc, (LZMAError, EOFError)) and mode == 'r':
            raise ReadError("not an lzma file") from exc
        raise
    # The LZMAFile was created here, so the archive has to close it.
    archive._extfileobj = False
    return archive
1750
# Registry of all *open() methods: compression type -> name of the
# classmethod that opens it. open() looks the method up by name.
OPEN_METH = dict(
    tar="taropen",      # uncompressed tar
    gz="gzopen",        # gzip compressed tar
    bz2="bz2open",      # bzip2 compressed tar
    xz="xzopen",        # lzma compressed tar
)
1758
1759 #--------------------------------------------------------------------------
1760 # The public methods which TarFile provides:
1761
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive, followed by zero padding up to the next
       multiple of RECORDSIZE. Calling close() again is a no-op. The
       underlying file object is closed only when it was not supplied
       by the caller.
    """
    if self.closed:
        return

    self.closed = True
    try:
        if self.mode in ("a", "w", "x"):
            trailer = BLOCKSIZE * 2
            self.fileobj.write(NUL * trailer)
            self.offset += trailer
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            overhang = self.offset % RECORDSIZE
            if overhang > 0:
                self.fileobj.write(NUL * (RECORDSIZE - overhang))
    finally:
        if not self._extfileobj:
            self.fileobj.close()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001782
def getmember(self, name):
    """Return a TarInfo object for member `name'. If `name' can not be
       found in the archive, KeyError is raised. If a member occurs more
       than once in the archive, its last occurrence is assumed to be the
       most up-to-date version.

       Trailing slashes in `name' are ignored, so a directory can be
       looked up as "dir" or as "dir/" (the form list() prints).
    """
    # Member paths are kept without a trailing slash, so normalise the
    # requested name the same way before searching.
    tarinfo = self._getmember(name.rstrip("/"))
    if tarinfo is None:
        raise KeyError("filename %r not found" % name)
    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001793
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects. The
       list has the same order as the members in the archive.
    """
    self._check()
    if self._loaded:
        return self.members
    # To obtain a list of all members, the whole archive has to be
    # scanned first.
    self._load()
    return self.members
1803
def getnames(self):
    """Return the names of the archive members as a list, in the same
       order as the list returned by getmembers().
    """
    members = self.getmembers()
    return [member.name for member in members]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001809
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object from the result of os.stat or equivalent
       on an existing file. The file is either named by `name', or
       specified as a file object `fileobj' with a file descriptor. If
       given, `arcname' specifies an alternative name for the file in the
       archive, otherwise, the name is taken from the 'name' attribute of
       'fileobj', or the 'name' argument. The name should be a text
       string.

       Returns None if the file is of a type that is not handled here
       (not a regular file, directory, FIFO, symbolic link, character
       device or block device).
    """
    self._check("awx")

    # When fileobj is given, replace name by
    # fileobj's real name.
    if fileobj is not None:
        name = fileobj.name

    # Building the name of the member in the archive.
    # Backward slashes are converted to forward slashes,
    # Absolute paths are turned to relative paths.
    if arcname is None:
        arcname = name
    drv, arcname = os.path.splitdrive(arcname)
    arcname = arcname.replace(os.sep, "/")
    arcname = arcname.lstrip("/")

    # Now, fill the TarInfo object with
    # information specific for the file.
    tarinfo = self.tarinfo()
    tarinfo.tarfile = self  # Not needed

    # Use os.stat or os.lstat, depending on if symlinks shall be resolved.
    if fileobj is None:
        if not self.dereference:
            statres = os.lstat(name)
        else:
            statres = os.stat(name)
    else:
        statres = os.fstat(fileobj.fileno())
    linkname = ""

    stmd = statres.st_mode
    if stat.S_ISREG(stmd):
        inode = (statres.st_ino, statres.st_dev)
        if not self.dereference and statres.st_nlink > 1 and \
                inode in self.inodes and arcname != self.inodes[inode]:
            # Is it a hardlink to an already
            # archived file?
            type = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # The inode is added only if it's valid.
            # For win32 it is always 0.
            type = REGTYPE
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(stmd):
        type = DIRTYPE
    elif stat.S_ISFIFO(stmd):
        type = FIFOTYPE
    elif stat.S_ISLNK(stmd):
        type = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(stmd):
        type = CHRTYPE
    elif stat.S_ISBLK(stmd):
        type = BLKTYPE
    else:
        # Any other file type cannot be represented here.
        return None

    # Fill the TarInfo object with all
    # information we can get.
    tarinfo.name = arcname
    tarinfo.mode = stmd
    tarinfo.uid = statres.st_uid
    tarinfo.gid = statres.st_gid
    # Only regular files get a size; every other type, including
    # hard links, is recorded with size 0.
    if type == REGTYPE:
        tarinfo.size = statres.st_size
    else:
        tarinfo.size = 0
    tarinfo.mtime = statres.st_mtime
    tarinfo.type = type
    tarinfo.linkname = linkname
    # pwd and grp are used only when truthy; an id with no database
    # entry leaves uname/gname untouched.
    if pwd:
        try:
            tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
        except KeyError:
            pass

    if type in (CHRTYPE, BLKTYPE):
        # Device numbers are filled in only when os provides both
        # major() and minor().
        if hasattr(os, "major") and hasattr(os, "minor"):
            tarinfo.devmajor = os.major(statres.st_rdev)
            tarinfo.devminor = os.minor(statres.st_rdev)
    return tarinfo
1908
def list(self, verbose=True, *, members=None):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced. `members' is optional and must be a subset of the
       list returned by getmembers().
    """
    self._check()

    entries = self if members is None else members
    for member in entries:
        if verbose:
            _safe_print(stat.filemode(member.mode))
            owner = member.uname or member.uid
            group = member.gname or member.gid
            _safe_print("%s/%s" % (owner, group))
            if member.ischr() or member.isblk():
                # Devices show "major,minor" in place of a size.
                device = "%d,%d" % (member.devmajor, member.devminor)
                _safe_print("%10s" % device)
            else:
                _safe_print("%10d" % member.size)
            stamp = time.localtime(member.mtime)[:6]
            _safe_print("%d-%02d-%02d %02d:%02d:%02d" % stamp)

        # Directories are shown with a trailing slash.
        _safe_print(member.name + ("/" if member.isdir() else ""))

        if verbose:
            if member.issym():
                _safe_print("-> " + member.linkname)
            if member.islnk():
                _safe_print("link to " + member.linkname)
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001940
def add(self, name, arcname=None, recursive=True, *, filter=None):
    """Add the file `name' to the archive. Any type of file is accepted
       (directory, fifo, symbolic link, etc.). `arcname', if given, is the
       alternative name the file gets in the archive. Directories are
       added together with their contents unless `recursive' is False.
       `filter' is a function that takes a TarInfo object and returns the
       changed TarInfo object; if it returns None the TarInfo object is
       left out of the archive.
    """
    self._check("awx")

    if arcname is None:
        arcname = name

    # The archive must not be added to itself.
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Build the TarInfo object that describes the file.
    info = self.gettarinfo(name, arcname)
    if info is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Let the filter modify or drop the member.
    if filter is not None:
        info = filter(info)
        if info is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Regular files are written with their data ...
    if info.isreg():
        with bltn_open(name, "rb") as source:
            self.addfile(info, source)
        return

    # ... everything else is header-only; directories may be descended.
    descend = info.isdir()
    self.addfile(info)
    if descend and recursive:
        for entry in sorted(os.listdir(name)):
            self.add(os.path.join(name, entry),
                     os.path.join(arcname, entry),
                     recursive, filter=filter)
1991
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. When `fileobj' is
       given it should be a binary file; tarinfo.size bytes are read from
       it and appended after the header. TarInfo objects can be created
       directly or by using gettarinfo().
    """
    self._check("awx")

    # Work on a copy so the caller's object is not the one stored.
    tarinfo = copy.copy(tarinfo)

    header = tarinfo.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset += len(header)
    bufsize = self.copybufsize

    if fileobj is not None:
        # Copy the data and pad it with NULs up to a full block.
        copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
        blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
        if remainder > 0:
            padding = NUL * (BLOCKSIZE - remainder)
            self.fileobj.write(padding)
            blocks += 1
        self.offset += blocks * BLOCKSIZE

    self.members.append(tarinfo)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002016
def extractall(self, path=".", members=None, *, numeric_owner=False):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers(). If `numeric_owner` is True, only
       the numbers for user/group names are used and not the names.

       If the same directory name occurs more than once, the attributes
       of the entry that comes last in `members' are the ones that stay
       in effect.
    """
    directories = []

    if members is None:
        members = self

    for tarinfo in members:
        if tarinfo.isdir():
            # Extract directories with a safe mode; the original object
            # (with the real mode) is remembered for the second pass.
            directories.append(tarinfo)
            tarinfo = copy.copy(tarinfo)
            tarinfo.mode = 0o700
        # Do not set_attrs directories, as we will do that further down
        self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(),
                     numeric_owner=numeric_owner)

    # Handle the deepest directories first.  A single stable sort with
    # reverse=True keeps entries with equal names in archive order, so a
    # later duplicate is applied after (and overrides) an earlier one;
    # sort() followed by reverse() would flip that order.
    directories.sort(key=lambda a: a.name, reverse=True)

    # Set correct owner, mtime and filemode on directories.
    for tarinfo in directories:
        dirpath = os.path.join(path, tarinfo.name)
        try:
            self.chown(tarinfo, dirpath, numeric_owner=numeric_owner)
            self.utime(tarinfo, dirpath)
            self.chmod(tarinfo, dirpath)
        except ExtractError as e:
            # Only errorlevel > 1 turns attribute failures into errors.
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)
2056
def extract(self, member, path="", set_attrs=True, *, numeric_owner=False):
    """Extract a single member from the archive to the current working
       directory, using its full name. `member' may be a filename or a
       TarInfo object. `path' selects a different target directory. File
       attributes (owner, mtime, mode) are set unless `set_attrs' is
       False. If `numeric_owner` is True, only the numbers for user/group
       names are used and not the names.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # makelink() needs to know where a hard link's target will live.
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    try:
        self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                             set_attrs=set_attrs,
                             numeric_owner=numeric_owner)
    except OSError as e:
        # errorlevel 0 downgrades OS errors to debug messages.
        if self.errorlevel > 0:
            raise
        if e.filename is None:
            self._dbg(1, "tarfile: %s" % e.strerror)
        else:
            self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
    except ExtractError as e:
        # Extraction errors are only fatal with errorlevel > 1.
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % e)
2094
def extractfile(self, member):
    """Extract a member from the archive as a file object. `member' may
       be a filename or a TarInfo object. For a regular file or a link an
       io.BufferedReader object is returned; for every other existing
       member the result is None. If `member' does not appear in the
       archive, KeyError is raised.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Members with unknown types are treated as regular files.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    # No data is associated with the member (directory, chrdev, blkdev,
    # etc.): there is no file object to hand out.
    if not (tarinfo.islnk() or tarinfo.issym()):
        return None

    # A small but ugly workaround for the case that someone tries to
    # extract a (sym)link as a file-object from a non-seekable stream
    # of tar blocks.
    if isinstance(self.fileobj, _Stream):
        raise StreamError("cannot extract (sym)link as file object")

    # A (sym)link's file object is its target's file object.
    return self.extractfile(self._find_link_target(tarinfo))
2126
def _extract_member(self, tarinfo, targetpath, set_attrs=True,
                    numeric_owner=False):
    """Extract the TarInfo object tarinfo to a physical
       file called targetpath.
    """
    # Drop trailing slashes and switch from forward slashes to the
    # platform specific separator.
    targetpath = targetpath.rstrip("/").replace("/", os.sep)

    # Parent directories that are not part of the archive are created
    # with default permissions.
    parent = os.path.dirname(targetpath)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    if tarinfo.islnk() or tarinfo.issym():
        self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
    else:
        self._dbg(1, tarinfo.name)

    # Pick the make*() method that matches the member type.
    if tarinfo.isreg():
        maker = self.makefile
    elif tarinfo.isdir():
        maker = self.makedir
    elif tarinfo.isfifo():
        maker = self.makefifo
    elif tarinfo.ischr() or tarinfo.isblk():
        maker = self.makedev
    elif tarinfo.islnk() or tarinfo.issym():
        maker = self.makelink
    elif tarinfo.type not in SUPPORTED_TYPES:
        maker = self.makeunknown
    else:
        maker = self.makefile
    maker(tarinfo, targetpath)

    if not set_attrs:
        return

    self.chown(tarinfo, targetpath, numeric_owner)
    # Mode and mtime are not applied to symbolic links.
    if not tarinfo.issym():
        self.chmod(tarinfo, targetpath)
        self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002170
2171 #--------------------------------------------------------------------------
2172 # Below are the different file methods. They are called via
2173 # _extract_member() when extract() is called. They can be replaced in a
2174 # subclass to implement other functionality.
2175
def makedir(self, tarinfo, targetpath):
    """Create the directory targetpath; an already existing one is
       left alone.
    """
    safe_mode = 0o700
    try:
        # The directory starts out with a safe mode, the real mode is
        # set later in _extract_member().
        os.mkdir(targetpath, safe_mode)
    except FileExistsError:
        return
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002185
def makefile(self, tarinfo, targetpath):
    """Write the member's data to a file called targetpath.
    """
    archive = self.fileobj
    archive.seek(tarinfo.offset_data)
    bufsize = self.copybufsize
    with bltn_open(targetpath, "wb") as out:
        if tarinfo.sparse is None:
            # Ordinary member: one contiguous copy.
            copyfileobj(archive, out, tarinfo.size, ReadError, bufsize)
        else:
            # Sparse member: write each (offset, size) piece at its
            # position, then cut the file to the recorded size.
            for position, length in tarinfo.sparse:
                out.seek(position)
                copyfileobj(archive, out, length, ReadError, bufsize)
            out.seek(tarinfo.size)
            out.truncate()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002201
def makeunknown(self, tarinfo, targetpath):
    """Create a file at targetpath from a TarInfo object whose type is
       unknown.
    """
    # Extract it like a regular file, then leave a debug note.
    self.makefile(tarinfo, targetpath)
    note = ("tarfile: Unknown file type %r, "
            "extracted as regular file.") % tarinfo.type
    self._dbg(1, note)
2209
def makefifo(self, tarinfo, targetpath):
    """Create a fifo called targetpath.
    """
    # Platforms without os.mkfifo cannot extract this member type.
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002217
def makedev(self, tarinfo, targetpath):
    """Create a character or block device called targetpath.
    """
    supported = hasattr(os, "mknod") and hasattr(os, "makedev")
    if not supported:
        raise ExtractError("special devices not supported by system")

    # Combine the permission bits with the device type flag.
    mode = tarinfo.mode | (stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR)
    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, mode, device)
2232
def makelink(self, tarinfo, targetpath):
    """Create a (symbolic) link called targetpath. When the link cannot
       be created (platform limitation), a copy of the referenced file is
       extracted instead.
    """
    try:
        # For systems that support symbolic and hard links.
        if not tarinfo.issym():
            # Hard link; _link_target is prepared by extract().
            if os.path.exists(tarinfo._link_target):
                os.link(tarinfo._link_target, targetpath)
            else:
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
        else:
            # Remove whatever is in the way so that os.symlink does
            # not fail with FileExistsError.
            if os.path.lexists(targetpath):
                os.unlink(targetpath)
            os.symlink(tarinfo.linkname, targetpath)
    except symlink_exception:
        # Fall back to extracting the member the link refers to.
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive") from None
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002258
def chown(self, tarinfo, targetpath, numeric_owner):
    """Set the owner of targetpath according to tarinfo. With
       numeric_owner set to True, .gid/.uid are used instead of
       .gname/.uname. With numeric_owner set to False, .gid/.uid are the
       fallback when the lookup by name fails.
    """
    # We have to be root to change ownership; otherwise do nothing.
    if not (hasattr(os, "geteuid") and os.geteuid() == 0):
        return

    g = tarinfo.gid
    u = tarinfo.uid
    if not numeric_owner:
        # Prefer the ids that belong to the stored names, if resolvable.
        try:
            if grp:
                g = grp.getgrnam(tarinfo.gname)[2]
        except KeyError:
            pass
        try:
            if pwd:
                u = pwd.getpwnam(tarinfo.uname)[2]
        except KeyError:
            pass

    try:
        # Symlinks are changed themselves where lchown is available.
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, u, g)
        else:
            os.chown(targetpath, u, g)
    except OSError as e:
        raise ExtractError("could not change owner") from e
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002287
def chmod(self, tarinfo, targetpath):
    """Apply the permission bits stored in tarinfo to targetpath.
    """
    try:
        os.chmod(targetpath, tarinfo.mode)
    except OSError as err:
        # Report the failure as an extraction problem, keeping the cause.
        raise ExtractError("could not change mode") from err
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002295
def utime(self, tarinfo, targetpath):
    """Apply the modification time stored in tarinfo to targetpath.
    """
    # Nothing can be done on platforms that lack os.utime.
    if hasattr(os, 'utime'):
        try:
            # tarinfo.mtime is used for both elements of the times pair.
            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
        except OSError as err:
            raise ExtractError("could not change modification time") from err
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002305
2306 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, when
       TarFile is opened for reading. Return None if there is no more
       available.

       A member held in self.firstmember is handed out first (and the
       attribute is cleared). Otherwise a header is parsed at
       self.offset and the resulting member is appended to
       self.members; when no member results, self._loaded is set.
       ReadError is raised when the data ends before self.offset and
       for the header errors that the handlers below do not tolerate.
    """
    # Only modes "r" and "a" pass this check.
    self._check("ra")
    if self.firstmember is not None:
        # Hand out the stored first member exactly once.
        m = self.firstmember
        self.firstmember = None
        return m

    # Advance the file pointer.  The byte just before self.offset is
    # read back: an empty read means the data stops short of the offset.
    if self.offset != self.fileobj.tell():
        self.fileobj.seek(self.offset - 1)
        if not self.fileobj.read(1):
            raise ReadError("unexpected end of data")

    # Read the next block.  Every handler either retries (continue),
    # raises ReadError, or falls through to the break with tarinfo
    # still None, which ends the iteration below.
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as e:
            # With ignore_zeros the block is skipped and parsing retried.
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as e:
            # Skipped with ignore_zeros; otherwise only an error when it
            # is the very first header (offset 0).
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
            elif self.offset == 0:
                raise ReadError(str(e)) from None
        except EmptyHeaderError:
            # Only reported when it happens at offset 0.
            if self.offset == 0:
                raise ReadError("empty file") from None
        except TruncatedHeaderError as e:
            # Only reported when it happens at offset 0.
            if self.offset == 0:
                raise ReadError(str(e)) from None
        except SubsequentHeaderError as e:
            # Always reported, whatever the offset.
            raise ReadError(str(e)) from None
        break

    if tarinfo is not None:
        self.members.append(tarinfo)
    else:
        # Nothing was read: mark the member list as complete.
        self._loaded = True

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002357
2358 #--------------------------------------------------------------------------
2359 # Little helper methods:
2360
def _getmember(self, name, tarinfo=None, normalize=False):
    """Look up an archive member by name, searching from bottom to top.
       If tarinfo is given, only the members located before it are
       searched. Returns None when nothing matches.
    """
    # getmembers() ensures that all members have been loaded.
    candidates = self.getmembers()

    # Limit the member search list up to tarinfo.
    if tarinfo is not None:
        stop = candidates.index(tarinfo)
        candidates = candidates[:stop]

    wanted = os.path.normpath(name) if normalize else name

    for candidate in reversed(candidates):
        current = candidate.name
        if normalize:
            current = os.path.normpath(current)
        if wanted == current:
            return candidate
    return None
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002383
def _load(self):
    """Walk through the whole archive file, looking for readable
       members.
    """
    # next() is called until it reports that nothing is left.
    while self.next() is not None:
        pass
    self._loaded = True
2393
def _check(self, mode=None):
    """Verify that the TarFile is still open and, when `mode' is given,
       that the TarFile's mode is one of the modes in it. OSError is
       raised otherwise.
    """
    if self.closed:
        raise OSError("%s is closed" % self.__class__.__name__)
    if mode is None:
        return
    if self.mode not in mode:
        raise OSError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002402
def _find_link_target(self, tarinfo):
    """Return the archive member that a symlink or hardlink member
       points to. KeyError is raised if there is none.
    """
    if tarinfo.issym():
        # Always search the entire archive.  The link name is joined to
        # the link's own directory, leaving out empty parts.
        pieces = (os.path.dirname(tarinfo.name), tarinfo.linkname)
        linkname = "/".join([piece for piece in pieces if piece])
        limit = None
    else:
        # Search the archive before the link, because a hard link is
        # just a reference to an already archived file.
        linkname = tarinfo.linkname
        limit = tarinfo

    target = self._getmember(linkname, tarinfo=limit, normalize=True)
    if target is not None:
        return target
    raise KeyError("linkname %r not found" % linkname)
2421
def __iter__(self):
    """Iterate over the members of the archive.
    """
    if self._loaded:
        yield from self.members
        return

    # Members are produced through TarFile's next() method; once they
    # have all been read, the TarFile is flagged as _loaded.
    position = 0
    # Fix for SF #1100429: Under rare circumstances it can
    # happen that getmembers() is called during iteration,
    # which will have already exhausted the next() method.
    if self.firstmember is not None:
        member = self.next()
        position += 1
        yield member

    while True:
        if position >= len(self.members):
            # Nothing cached at this position: read on or stop.
            if self._loaded:
                return
            member = self.next()
            if not member:
                self._loaded = True
                return
        else:
            member = self.members[position]
        position += 1
        yield member
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002452
def _dbg(self, level, msg):
    """Print `msg' to sys.stderr if `level' does not exceed self.debug.
    """
    if level <= self.debug:
        stream = sys.stderr
        print(msg, file=stream)
Lars Gustäbel01385812010-03-03 12:08:54 +00002458
def __enter__(self):
    """Enter a `with' block: run _check() and hand back the TarFile
       itself.
    """
    self._check()
    archive = self
    return archive
2462
def __exit__(self, type, value, traceback):
    """Leave a `with' block: close() normally, or just release the file
       object when an exception is propagating.
    """
    if type is None:
        self.close()
        return

    # An exception occurred. We must not call close() because
    # it would try to write end-of-archive blocks and padding.
    if not self._extfileobj:
        self.fileobj.close()
    self.closed = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002472
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002473#--------------------
2474# exported functions
2475#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.

       'name' should be a string, file, or file-like object. For a file
       or file-like object the position it had on entry is restored
       before returning, so the caller can keep using the object (for
       example to open the archive afterwards).
    """
    try:
        if hasattr(name, "read"):
            # Probing reads from the object; remember where it stood and
            # put it back whether or not the probe succeeds.
            pos = name.tell()
            try:
                open(fileobj=name).close()
            finally:
                name.seek(pos)
        else:
            open(name).close()
        return True
    except TarError:
        return False
2491
# Module-level open() is an alias for TarFile.open. From this line on the
# name `open' in this module no longer refers to the builtin.
open = TarFile.open
Serhiy Storchakad27b4552013-11-24 01:53:29 +02002493
2494
def main():
    """Command-line entry point: list, extract, create or test a tar
       archive, depending on the option given.
    """
    import argparse

    description = 'A simple command-line interface for tarfile module.'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Verbose output')
    # Exactly one of the following actions has to be selected.
    actions = parser.add_mutually_exclusive_group(required=True)
    actions.add_argument('-l', '--list', metavar='<tarfile>',
                         help='Show listing of a tarfile')
    actions.add_argument('-e', '--extract', nargs='+',
                         metavar=('<tarfile>', '<output_dir>'),
                         help='Extract tarfile into target dir')
    actions.add_argument('-c', '--create', nargs='+',
                         metavar=('<name>', '<file>'),
                         help='Create tarfile from sources')
    actions.add_argument('-t', '--test', metavar='<tarfile>',
                         help='Test if a tarfile is valid')
    args = parser.parse_args()
    verbose = args.verbose

    if args.test is not None:
        src = args.test
        if not is_tarfile(src):
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
        else:
            with open(src, 'r') as tar:
                tar.getmembers()
                print(tar.getmembers(), file=sys.stderr)
            if verbose:
                print('{!r} is a tar archive.'.format(src))

    elif args.list is not None:
        src = args.list
        if not is_tarfile(src):
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
        else:
            with TarFile.open(src, 'r:*') as tf:
                tf.list(verbose=verbose)

    elif args.extract is not None:
        # One argument: archive only; two: archive and output directory.
        count = len(args.extract)
        if count == 1:
            src = args.extract[0]
            curdir = os.curdir
        elif count == 2:
            src, curdir = args.extract
        else:
            parser.exit(1, parser.format_help())

        if not is_tarfile(src):
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
        else:
            with TarFile.open(src, 'r:*') as tf:
                tf.extractall(path=curdir)
            if verbose:
                if curdir == '.':
                    msg = '{!r} file is extracted.'.format(src)
                else:
                    msg = ('{!r} file is extracted '
                           'into {!r} directory.').format(src, curdir)
                print(msg)

    elif args.create is not None:
        # The first name is the archive, the rest are the files to add.
        tar_name = args.create.pop(0)
        tar_files = args.create
        _, ext = os.path.splitext(tar_name)
        # File extension -> compression used for writing.
        compressions = {
            # gz
            '.gz': 'gz',
            '.tgz': 'gz',
            # xz
            '.xz': 'xz',
            '.txz': 'xz',
            # bz2
            '.bz2': 'bz2',
            '.tbz': 'bz2',
            '.tbz2': 'bz2',
            '.tb2': 'bz2',
        }
        if ext in compressions:
            tar_mode = 'w:' + compressions[ext]
        else:
            tar_mode = 'w'

        with TarFile.open(tar_name, tar_mode) as tf:
            for file_name in tar_files:
                tf.add(file_name)

        if verbose:
            print('{!r} file created.'.format(tar_name))
2581
# Run the command-line interface when the module is executed as a script.
if __name__ == '__main__':
    main()