blob: f6d7f7939007e445d5d6291c02b4737503c584a7 [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
# Module metadata.  The non-ASCII letter in the author names is written as
# an escape sequence so that the source file itself stays pure ASCII.
version = "0.9.0"
__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
# Revision-control keyword strings, kept verbatim.
__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000037
38#---------
39# Imports
40#---------
41import sys
42import os
Eli Bendersky74c503b2012-01-03 06:26:13 +020043import io
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000044import shutil
45import stat
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000046import time
47import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000048import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000049import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000050
51try:
52 import grp, pwd
53except ImportError:
54 grp = pwd = None
55
# Exceptions that signal "symlinks cannot be created here".
# os.symlink on Windows prior to 6.0 raises NotImplementedError; a
# WindowsError (1314) is raised if the caller does not hold the
# SeCreateSymbolicLinkPrivilege privilege.  WindowsError is only defined
# on Windows, hence the NameError fallback.
try:
    symlink_exception = (AttributeError, NotImplementedError, WindowsError)
except NameError:
    symlink_exception = (AttributeError, NotImplementedError)
64
# Names exported by "from tarfile import *".
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
67
Georg Brandl1a3284e2007-12-02 09:40:06 +000068from builtins import open as _open # Since 'open' is TarFile.open
Guido van Rossum8f78fe92006-08-24 04:03:53 +000069
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000070#---------------------------------------------------------
71# tar constants
72#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
# Both magic strings fill the 8 bytes of the header's magic + version
# fields.  GNU tar writes "ustar" followed by TWO spaces and a NUL; with a
# single space the value would be only 7 bytes long and would never match
# a GNU header.
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT

#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Each value is the converter applied to the
# field's text.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# Bits used in the mode field, values in octal.
#---------------------------------------------------------
S_IFLNK = 0o120000              # symbolic link
S_IFREG = 0o100000              # regular file
S_IFBLK = 0o060000              # block device
S_IFDIR = 0o040000              # directory
S_IFCHR = 0o020000              # character device
S_IFIFO = 0o010000              # fifo

TSUID = 0o4000                  # set UID on execution
TSGID = 0o2000                  # set GID on execution
TSVTX = 0o1000                  # reserved

TUREAD = 0o400                  # read by owner
TUWRITE = 0o200                 # write by owner
TUEXEC = 0o100                  # execute/search by owner
TGREAD = 0o040                  # read by group
TGWRITE = 0o020                 # write by group
TGEXEC = 0o010                  # execute/search by group
TOREAD = 0o004                  # read by other
TOWRITE = 0o002                 # write by other
TOEXEC = 0o001                  # execute/search by other
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000165
166#---------------------------------------------------------
Guido van Rossumd8faa362007-04-27 19:54:29 +0000167# initialization
168#---------------------------------------------------------
# Default encoding for member names: always UTF-8 on Windows ("nt", "ce"),
# otherwise whatever the file system encoding is.
ENCODING = "utf-8" if os.name in ("nt", "ce") else sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000173
174#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000175# Some useful functions
176#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000177
def stn(s, length, encoding, errors):
    """Encode the string s into a bytes field of the given length.

    The encoded text is cut off after length bytes if it is too long and
    padded on the right with NUL bytes if it is too short.
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, NUL)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
def nts(s, encoding, errors):
    """Decode a bytes field into a string.

    Only the part of s in front of the first NUL byte is decoded; without
    a NUL byte the whole of s is used.
    """
    text, _nul, _rest = s.partition(b"\0")
    return text.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000191
def nti(s):
    """Convert a number field (bytes) to a python number.

    Raises InvalidHeaderError if an octal field cannot be parsed.
    """
    # There are two possible encodings for a number field, see
    # itn() below.
    marker = s[0]
    if marker not in (0o200, 0o377):
        # Plain field: NUL-terminated octal digits, empty means zero.
        try:
            return int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")

    # Base-256 field: everything after the marker byte is a big-endian
    # number; a 0o377 marker means it is the two's complement of a
    # negative value.
    value = int.from_bytes(s[1:], "big")
    if marker == 0o377:
        value = -(256 ** (len(s) - 1) - value)
    return value
210
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field of digits bytes.

    Raises ValueError if n does not fit into the field for the given
    format.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    width = digits - 1
    if 0 <= n < 8 ** width:
        return bytes("%0*o" % (width, n), "ascii") + NUL

    limit = 256 ** width
    if format == GNU_FORMAT and -limit <= n < limit:
        if n >= 0:
            field = bytearray([0o200])
        else:
            field = bytearray([0o377])
            # Shift negative values into the non-negative range so that
            # their low bytes form the two's complement representation.
            n = 256 ** digits + n
        # Emit the low `width` bytes of n, most significant byte first.
        field.extend((n >> shift) & 0o377
                     for shift in range(8 * (width - 1), -1, -8))
        return field

    raise ValueError("overflow in number field")
238
def calc_chksums(buf):
    """Return the (unsigned, signed) checksum pair of a member's header.

    All bytes of the 512 byte header are summed up, except for the 8 byte
    chksum field which counts as if it was filled with spaces (hence the
    constant 256 == 8 * 0x20).  According to the GNU tar sources, some
    tars (Sun and NeXT) calculate chksum with signed char, which gives a
    different result if there are bytes with the high bit set.  So both
    variants are calculated.
    """
    # "B" reads unsigned bytes, "b" signed ones; "8x" skips the chksum field.
    layouts = ("148B8x356B", "148b8x356b")
    unsigned_chksum, signed_chksum = (
        256 + sum(struct.unpack_from(layout, buf)) for layout in layouts)
    return unsigned_chksum, signed_chksum
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000251
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.

    If length is None, copy the entire content.  Raises IOError if src
    runs out of data before length bytes have been copied.
    """
    if length is None:
        shutil.copyfileobj(src, dst)
        return
    if length == 0:
        return

    BUFSIZE = 16 * 1024
    full_reads, tail = divmod(length, BUFSIZE)

    def _read_sizes():
        # A run of full-size reads followed by one read for the rest.
        for _ in range(full_reads):
            yield BUFSIZE
        if tail != 0:
            yield tail

    for wanted in _read_sizes():
        chunk = src.read(wanted)
        if len(chunk) < wanted:
            raise IOError("end of file reached")
        dst.write(chunk)
    return
276
def filemode(mode):
    """Deprecated in this location; use stat.filemode."""
    from warnings import warn
    # stacklevel 2 makes the warning point at the caller of filemode().
    warn("deprecated in favor of stat.filemode", DeprecationWarning,
         stacklevel=2)
    return stat.filemode(mode)
283
Serhiy Storchaka3b4f1592014-02-05 20:53:36 +0200284def _safe_print(s):
285 encoding = getattr(sys.stdout, 'encoding', None)
286 if encoding is not None:
287 s = s.encode(encoding, 'backslashreplace').decode(encoding)
288 print(s, end=' ')
289
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000290
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000324
325#---------------------------
326# internal stream interface
327#---------------------------
328class _LowLevelFile:
329 """Low-level file object. Supports reading and writing.
330 It is used instead of a regular file object for streaming
331 access.
332 """
333
334 def __init__(self, name, mode):
335 mode = {
336 "r": os.O_RDONLY,
337 "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
338 }[mode]
339 if hasattr(os, "O_BINARY"):
340 mode |= os.O_BINARY
Lars Gustäbeld6eb70b2010-04-29 15:37:02 +0000341 self.fd = os.open(name, mode, 0o666)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000342
343 def close(self):
344 os.close(self.fd)
345
346 def read(self, size):
347 return os.read(self.fd, size)
348
349 def write(self, s):
350 os.write(self.fd, s)
351
352class _Stream:
353 """Class that serves as an adapter between TarFile and
354 a stream-like object. The stream-like object only
355 needs to have a read() or write() method and is accessed
356 blockwise. Use of gzip or bzip2 compression is possible.
357 A stream-like object could be for example: sys.stdin,
358 sys.stdout, a socket, a tape device etc.
359
360 _Stream is intended to be used only internally.
361 """
362
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000363 def __init__(self, name, mode, comptype, fileobj, bufsize):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000364 """Construct a _Stream object.
365 """
366 self._extfileobj = True
367 if fileobj is None:
368 fileobj = _LowLevelFile(name, mode)
369 self._extfileobj = False
370
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000371 if comptype == '*':
372 # Enable transparent compression detection for the
373 # stream interface
374 fileobj = _StreamProxy(fileobj)
375 comptype = fileobj.getcomptype()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000376
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000377 self.name = name or ""
378 self.mode = mode
379 self.comptype = comptype
380 self.fileobj = fileobj
381 self.bufsize = bufsize
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000382 self.buf = b""
Guido van Rossume2a383d2007-01-15 16:59:06 +0000383 self.pos = 0
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000384 self.closed = False
385
Antoine Pitrou605c2932010-09-23 20:15:14 +0000386 try:
387 if comptype == "gz":
388 try:
389 import zlib
390 except ImportError:
391 raise CompressionError("zlib module is not available")
392 self.zlib = zlib
393 self.crc = zlib.crc32(b"")
394 if mode == "r":
395 self._init_read_gz()
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100396 self.exception = zlib.error
Antoine Pitrou605c2932010-09-23 20:15:14 +0000397 else:
398 self._init_write_gz()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000399
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100400 elif comptype == "bz2":
Antoine Pitrou605c2932010-09-23 20:15:14 +0000401 try:
402 import bz2
403 except ImportError:
404 raise CompressionError("bz2 module is not available")
405 if mode == "r":
406 self.dbuf = b""
407 self.cmp = bz2.BZ2Decompressor()
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100408 self.exception = IOError
Antoine Pitrou605c2932010-09-23 20:15:14 +0000409 else:
410 self.cmp = bz2.BZ2Compressor()
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100411
412 elif comptype == "xz":
413 try:
414 import lzma
415 except ImportError:
416 raise CompressionError("lzma module is not available")
417 if mode == "r":
418 self.dbuf = b""
419 self.cmp = lzma.LZMADecompressor()
420 self.exception = lzma.LZMAError
421 else:
422 self.cmp = lzma.LZMACompressor()
423
424 elif comptype != "tar":
425 raise CompressionError("unknown compression type %r" % comptype)
426
Antoine Pitrou605c2932010-09-23 20:15:14 +0000427 except:
428 if not self._extfileobj:
429 self.fileobj.close()
430 self.closed = True
431 raise
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000432
433 def __del__(self):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000434 if hasattr(self, "closed") and not self.closed:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000435 self.close()
436
437 def _init_write_gz(self):
438 """Initialize for writing with gzip compression.
439 """
440 self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
441 -self.zlib.MAX_WBITS,
442 self.zlib.DEF_MEM_LEVEL,
443 0)
Guido van Rossume2a383d2007-01-15 16:59:06 +0000444 timestamp = struct.pack("<L", int(time.time()))
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000445 self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000446 if self.name.endswith(".gz"):
447 self.name = self.name[:-3]
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000448 # RFC1952 says we must use ISO-8859-1 for the FNAME field.
449 self.__write(self.name.encode("iso-8859-1", "replace") + NUL)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000450
451 def write(self, s):
452 """Write string s to the stream.
453 """
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000454 if self.comptype == "gz":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000455 self.crc = self.zlib.crc32(s, self.crc)
456 self.pos += len(s)
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000457 if self.comptype != "tar":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000458 s = self.cmp.compress(s)
459 self.__write(s)
460
461 def __write(self, s):
462 """Write string s to the stream if a whole new block
463 is ready to be written.
464 """
465 self.buf += s
466 while len(self.buf) > self.bufsize:
467 self.fileobj.write(self.buf[:self.bufsize])
468 self.buf = self.buf[self.bufsize:]
469
470 def close(self):
471 """Close the _Stream object. No operation should be
472 done on it afterwards.
473 """
474 if self.closed:
475 return
476
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000477 if self.mode == "w" and self.comptype != "tar":
Martin v. Löwisc234a522004-08-22 21:28:33 +0000478 self.buf += self.cmp.flush()
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000479
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000480 if self.mode == "w" and self.buf:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000481 self.fileobj.write(self.buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000482 self.buf = b""
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000483 if self.comptype == "gz":
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000484 # The native zlib crc is an unsigned 32-bit integer, but
485 # the Python wrapper implicitly casts that to a signed C
486 # long. So, on a 32-bit box self.crc may "look negative",
487 # while the same crc on a 64-bit box may "look positive".
488 # To avoid irksome warnings from the `struct` module, force
489 # it to look positive on all boxes.
Guido van Rossume2a383d2007-01-15 16:59:06 +0000490 self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
491 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000492
493 if not self._extfileobj:
494 self.fileobj.close()
495
496 self.closed = True
497
498 def _init_read_gz(self):
499 """Initialize for reading a gzip compressed fileobj.
500 """
501 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000502 self.dbuf = b""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000503
504 # taken from gzip.GzipFile with some alterations
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000505 if self.__read(2) != b"\037\213":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000506 raise ReadError("not a gzip file")
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000507 if self.__read(1) != b"\010":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508 raise CompressionError("unsupported compression method")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000509
510 flag = ord(self.__read(1))
511 self.__read(6)
512
513 if flag & 4:
514 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
515 self.read(xlen)
516 if flag & 8:
517 while True:
518 s = self.__read(1)
519 if not s or s == NUL:
520 break
521 if flag & 16:
522 while True:
523 s = self.__read(1)
524 if not s or s == NUL:
525 break
526 if flag & 2:
527 self.__read(2)
528
529 def tell(self):
530 """Return the stream's file pointer position.
531 """
532 return self.pos
533
534 def seek(self, pos=0):
535 """Set the stream's file pointer to pos. Negative seeking
536 is forbidden.
537 """
538 if pos - self.pos >= 0:
539 blocks, remainder = divmod(pos - self.pos, self.bufsize)
Guido van Rossum805365e2007-05-07 22:24:25 +0000540 for i in range(blocks):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000541 self.read(self.bufsize)
542 self.read(remainder)
543 else:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544 raise StreamError("seeking backwards is not allowed")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000545 return self.pos
546
547 def read(self, size=None):
548 """Return the next size number of bytes from the stream.
549 If size is not defined, return all bytes of the stream
550 up to EOF.
551 """
552 if size is None:
553 t = []
554 while True:
555 buf = self._read(self.bufsize)
556 if not buf:
557 break
558 t.append(buf)
559 buf = "".join(t)
560 else:
561 buf = self._read(size)
562 self.pos += len(buf)
563 return buf
564
565 def _read(self, size):
566 """Return size bytes from the stream.
567 """
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000568 if self.comptype == "tar":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000569 return self.__read(size)
570
571 c = len(self.dbuf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000572 while c < size:
573 buf = self.__read(self.bufsize)
574 if not buf:
575 break
Guido van Rossumd8faa362007-04-27 19:54:29 +0000576 try:
577 buf = self.cmp.decompress(buf)
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100578 except self.exception:
Guido van Rossumd8faa362007-04-27 19:54:29 +0000579 raise ReadError("invalid compressed data")
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000580 self.dbuf += buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000581 c += len(buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000582 buf = self.dbuf[:size]
583 self.dbuf = self.dbuf[size:]
584 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000585
586 def __read(self, size):
587 """Return size bytes from stream. If internal buffer is empty,
588 read another block from the stream.
589 """
590 c = len(self.buf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000591 while c < size:
592 buf = self.fileobj.read(self.bufsize)
593 if not buf:
594 break
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000595 self.buf += buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000596 c += len(buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000597 buf = self.buf[:size]
598 self.buf = self.buf[size:]
599 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000600# class _Stream
601
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000602class _StreamProxy(object):
603 """Small proxy class that enables transparent compression
604 detection for the Stream interface (mode 'r|*').
605 """
606
607 def __init__(self, fileobj):
608 self.fileobj = fileobj
609 self.buf = self.fileobj.read(BLOCKSIZE)
610
611 def read(self, size):
612 self.read = self.fileobj.read
613 return self.buf
614
615 def getcomptype(self):
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100616 if self.buf.startswith(b"\x1f\x8b\x08"):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000617 return "gz"
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100618 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000619 return "bz2"
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100620 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
621 return "xz"
622 else:
623 return "tar"
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000624
625 def close(self):
626 self.fileobj.close()
627# class StreamProxy
628
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000629#------------------------
630# Extraction file object
631#------------------------
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000632class _FileInFile(object):
633 """A thin wrapper around an existing file object that
634 provides a part of its data as an individual file
635 object.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000636 """
637
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000638 def __init__(self, fileobj, offset, size, blockinfo=None):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000639 self.fileobj = fileobj
640 self.offset = offset
641 self.size = size
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000642 self.position = 0
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200643 self.name = getattr(fileobj, "name", None)
644 self.closed = False
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000645
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000646 if blockinfo is None:
647 blockinfo = [(0, size)]
648
649 # Construct a map with data and zero blocks.
650 self.map_index = 0
651 self.map = []
652 lastpos = 0
653 realpos = self.offset
654 for offset, size in blockinfo:
655 if offset > lastpos:
656 self.map.append((False, lastpos, offset, None))
657 self.map.append((True, offset, offset + size, realpos))
658 realpos += size
659 lastpos = offset + size
660 if lastpos < self.size:
661 self.map.append((False, lastpos, self.size, None))
662
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200663 def flush(self):
664 pass
665
666 def readable(self):
667 return True
668
669 def writable(self):
670 return False
671
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000672 def seekable(self):
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000673 return self.fileobj.seekable()
674
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000675 def tell(self):
676 """Return the current file position.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000677 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000678 return self.position
679
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200680 def seek(self, position, whence=io.SEEK_SET):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000681 """Seek to a position in the file.
682 """
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200683 if whence == io.SEEK_SET:
684 self.position = min(max(position, 0), self.size)
685 elif whence == io.SEEK_CUR:
686 if position < 0:
687 self.position = max(self.position + position, 0)
688 else:
689 self.position = min(self.position + position, self.size)
690 elif whence == io.SEEK_END:
691 self.position = max(min(self.size + position, self.size), 0)
692 else:
693 raise ValueError("Invalid argument")
694 return self.position
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000695
696 def read(self, size=None):
697 """Read data from the file.
698 """
699 if size is None:
700 size = self.size - self.position
701 else:
702 size = min(size, self.size - self.position)
703
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000704 buf = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000705 while size > 0:
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000706 while True:
707 data, start, stop, offset = self.map[self.map_index]
708 if start <= self.position < stop:
709 break
710 else:
711 self.map_index += 1
712 if self.map_index == len(self.map):
713 self.map_index = 0
714 length = min(size, stop - self.position)
715 if data:
Lars Gustäbeldd071042011-02-23 11:42:22 +0000716 self.fileobj.seek(offset + (self.position - start))
717 buf += self.fileobj.read(length)
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000718 else:
719 buf += NUL * length
720 size -= length
721 self.position += length
722 return buf
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000723
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200724 def readinto(self, b):
725 buf = self.read(len(b))
726 b[:len(buf)] = buf
727 return len(buf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000728
729 def close(self):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000730 self.closed = True
Lars Gustäbel7a919e92012-05-05 18:15:03 +0200731#class _FileInFile
Martin v. Löwisdf241532005-03-03 08:17:42 +0000732
class ExFileObject(io.BufferedReader):
    """Buffered reader for the data of a single archive member."""

    def __init__(self, tarfile, tarinfo):
        # The raw stream is the slice of the archive's file object that
        # holds the member's data (tarinfo.sparse describes its blocks).
        super().__init__(_FileInFile(tarfile.fileobj,
                                     tarinfo.offset_data,
                                     tarinfo.size,
                                     tarinfo.sparse))
#class ExFileObject
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000740
741#------------------
742# Exported Classes
743#------------------
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    # Note: "tarfile", "_sparse_structs" and "_link_target" are declared here
    # but are not initialized by __init__(); they are only set on demand.
    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")

    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member.
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information

    # In pax headers the "name" and "linkname" field are called
    # "path" and "linkpath".
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))

    def get_info(self):
        """Return the TarInfo's attributes as a dictionary.
           The mode is reduced to its lower 12 bits (0o7777) and the
           name of a directory member always ends with a slash.
        """
        info = {
            "name":     self.name,
            "mode":     self.mode & 0o7777,
            "uid":      self.uid,
            "gid":      self.gid,
            "size":     self.size,
            "mtime":    self.mtime,
            "chksum":   self.chksum,
            "type":     self.type,
            "linkname": self.linkname,
            "uname":    self.uname,
            "gname":    self.gname,
            "devmajor": self.devmajor,
            "devminor": self.devminor
        }

        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
            info["name"] += "/"

        return info

    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
        """Return a tar header as a bytes object made of 512 byte blocks.
           format selects the header flavour (USTAR_FORMAT, GNU_FORMAT or
           PAX_FORMAT); any other value raises ValueError. encoding and
           errors control how string fields are converted to bytes (errors
           is not used for PAX_FORMAT).
        """
        info = self.get_info()

        if format == USTAR_FORMAT:
            return self.create_ustar_header(info, encoding, errors)
        elif format == GNU_FORMAT:
            return self.create_gnu_header(info, encoding, errors)
        elif format == PAX_FORMAT:
            return self.create_pax_header(info, encoding)
        else:
            raise ValueError("invalid format")

    def create_ustar_header(self, info, encoding, errors):
        """Return the object as a ustar header block.
           Raise ValueError if the linkname is longer than LENGTH_LINK
           or if the name cannot be split into a prefix and a name part.
        """
        info["magic"] = POSIX_MAGIC

        if len(info["linkname"]) > LENGTH_LINK:
            raise ValueError("linkname is too long")

        if len(info["name"]) > LENGTH_NAME:
            info["prefix"], info["name"] = self._posix_split_name(info["name"])

        return self._create_header(info, USTAR_FORMAT, encoding, errors)

    def create_gnu_header(self, info, encoding, errors):
        """Return the object as a GNU header block sequence.
           Over-long names and linknames are stored in additional
           GNUTYPE_LONGNAME / GNUTYPE_LONGLINK members that are put in
           front of the actual header.
        """
        info["magic"] = GNU_MAGIC

        buf = b""
        if len(info["linkname"]) > LENGTH_LINK:
            buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)

        if len(info["name"]) > LENGTH_NAME:
            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)

        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)

    def create_pax_header(self, info, encoding):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        # Work on a copy so that self.pax_headers is left untouched.
        pax_headers = self.pax_headers.copy()

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32), ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")

    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")

    def _posix_split_name(self, name):
        """Split a name longer than 100 chars into a prefix
           and a name part. The split is made at a slash, which is
           stored in neither part. Raise ValueError if no suitable
           split position exists.
        """
        # Find the last slash within the first LENGTH_PREFIX + 1 characters.
        prefix = name[:LENGTH_PREFIX + 1]
        while prefix and prefix[-1] != "/":
            prefix = prefix[:-1]

        name = name[len(prefix):]
        # Drop the separating slash from the prefix.
        prefix = prefix[:-1]

        if not prefix or len(name) > LENGTH_NAME:
            raise ValueError("name is too long")
        return prefix, name

    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.
           Missing keys in info are replaced by default values.
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            # Checksum field placeholder. It has to be exactly 8 bytes wide:
            # frombuf() reads the checksum from buf[148:156] and the splice
            # below (-364 / -357) relies on the same layout.
            b" " * 8,
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            stn(info.get("prefix", ""), 155, encoding, errors)
        ]

        # Pad the joined fields with NULs up to a full block.
        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # Overwrite the first 7 bytes of the checksum field with six octal
        # digits and a NUL; the 8th byte stays a space.
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf

    @staticmethod
    def _create_payload(payload):
        """Return the string payload filled with zero bytes
           up to the next 512 byte border.
        """
        remainder = len(payload) % BLOCKSIZE
        if remainder > 0:
            payload += (BLOCKSIZE - remainder) * NUL
        return payload

    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
           for name.
        """
        # The payload is the encoded name followed by a terminating NUL.
        name = name.encode(encoding, errors) + NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
                cls._create_payload(name)

    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
           that contains a list of keyword, value pairs. The values
           must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for value in pax_headers.values():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            base = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            # The record length counts its own decimal digits as well, so
            # iterate until the total length no longer changes.
            n = p = 0
            while True:
                n = base + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)

    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.
           Raise EmptyHeaderError, TruncatedHeaderError, EOFHeaderError
           or InvalidHeaderError if buf is empty, has the wrong size,
           consists of NUL bytes only or has a bad checksum.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        return obj

    @classmethod
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
           tarfile.
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)

    #--------------------------------------------------------------------------
    # The following are methods that are called depending on the type of a
    # member. The entry point is _proc_member() which can be overridden in a
    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following
    # operations:
    # 1. Set self.offset_data to the position where the data blocks begin,
    #    if there is data that follows.
    # 2. Set tarfile.offset to the position where the next member's header will
    #    begin.
    # 3. Return self or another valid TarInfo object.
    def _proc_member(self, tarfile):
        """Choose the right processing method depending on
           the type and call it.
        """
        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
            return self._proc_gnulong(tarfile)
        elif self.type == GNUTYPE_SPARSE:
            return self._proc_sparse(tarfile)
        elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
            return self._proc_pax(tarfile)
        else:
            return self._proc_builtin(tarfile)

    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
           will be treated as a regular file.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self

    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
           or longlink member.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next

    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        # self.size still holds the size stored in the header at this point;
        # it is replaced by the original file size afterwards.
        tarfile.offset = self.offset_data + self._block(self.size)
        self.size = origsize
        return self

    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2008.
           Raise InvalidHeaderError if a record declares a length of
           zero and SubsequentHeaderError if the header that follows
           is missing or bad.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf-8"

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            if length == 0:
                # A zero-length record would never advance pos and make
                # this loop spin forever on a crafted archive.
                raise InvalidHeaderError("invalid header")
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf-8", "utf-8",
                        tarfile.errors)

            pax_headers[keyword] = value
            pos += length

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next

    def _proc_gnusparse_00(self, next, pax_headers, buf):
        """Process a GNU tar extended sparse header, version 0.0.
           The offset and numbytes records are collected from the raw
           header data in buf and stored pairwise in next.sparse.
        """
        offsets = []
        for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
            offsets.append(int(match.group(1)))
        numbytes = []
        for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
            numbytes.append(int(match.group(1)))
        next.sparse = list(zip(offsets, numbytes))

    def _proc_gnusparse_01(self, next, pax_headers):
        """Process a GNU tar extended sparse header, version 0.1.
           The "GNU.sparse.map" record is a comma-separated list of
           numbers that is read as alternating offset, numbytes values.
        """
        sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
        """Process a GNU tar extended sparse header, version 1.0.
           The sparse map is read from the archive itself as newline
           terminated decimal numbers: first the number of entries,
           then an offset, numbytes pair for every entry.
        """
        sparse = []
        buf = tarfile.fileobj.read(BLOCKSIZE)
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            if b"\n" not in buf:
                # The map continues in the next block.
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
           pax extended or global header.
        """
        for keyword, value in pax_headers.items():
            if keyword == "GNU.sparse.name":
                setattr(self, "path", value)
            elif keyword == "GNU.sparse.size":
                setattr(self, "size", int(value))
            elif keyword == "GNU.sparse.realsize":
                setattr(self, "size", int(value))
            elif keyword in PAX_FIELDS:
                if keyword in PAX_NUMBER_FIELDS:
                    # Fall back to 0 if the number cannot be converted.
                    try:
                        value = PAX_NUMBER_FIELDS[keyword](value)
                    except ValueError:
                        value = 0
                if keyword == "path":
                    value = value.rstrip("/")
                setattr(self, keyword, value)

        self.pax_headers = pax_headers.copy()

    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
        """Decode a single field from a pax record.
           Strict decoding with encoding is tried first; if it fails,
           fallback_encoding and fallback_errors are used instead.
        """
        try:
            return value.decode(encoding, "strict")
        except UnicodeDecodeError:
            return value.decode(fallback_encoding, fallback_errors)

    def _block(self, count):
        """Round up a byte count by BLOCKSIZE and return it,
           e.g. _block(834) => 1024.
        """
        blocks, remainder = divmod(count, BLOCKSIZE)
        if remainder:
            blocks += 1
        return blocks * BLOCKSIZE

    def isreg(self):
        """Return True if the member's type is one of REGULAR_TYPES."""
        return self.type in REGULAR_TYPES
    def isfile(self):
        """Same as isreg()."""
        return self.isreg()
    def isdir(self):
        """Return True if the member is a directory."""
        return self.type == DIRTYPE
    def issym(self):
        """Return True if the member is a symbolic link."""
        return self.type == SYMTYPE
    def islnk(self):
        """Return True if the member is a hard link."""
        return self.type == LNKTYPE
    def ischr(self):
        """Return True if the member is a character device."""
        return self.type == CHRTYPE
    def isblk(self):
        """Return True if the member is a block device."""
        return self.type == BLKTYPE
    def isfifo(self):
        """Return True if the member is a FIFO."""
        return self.type == FIFOTYPE
    def issparse(self):
        """Return True if sparse information is present for the member."""
        return self.sparse is not None
    def isdev(self):
        """Return True if the member is a character device, block device or FIFO."""
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1399# class TarInfo
1400
1401class TarFile(object):
1402 """The TarFile Class provides an interface to tar archives.
1403 """
1404
# Class-level defaults. __init__() replaces most of them on the instance
# when the keyword argument of the same name is not None.

debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

dereference = False         # If true, add content of linked file to the
                            # tar file, else the link.

ignore_zeros = False        # If true, skips empty or invalid blocks and
                            # continues processing.

errorlevel = 1              # If 0, fatal errors only appear in debug
                            # messages (if debug >= 0). If > 0, errors
                            # are passed to the caller as exceptions.

format = DEFAULT_FORMAT     # The format to use when creating an archive.

encoding = ENCODING         # Encoding for 8-bit character strings.

errors = None               # Error handler for unicode conversion;
                            # __init__() always overwrites it with its
                            # `errors' argument.

tarinfo = TarInfo           # The default TarInfo class to use.

fileobject = ExFileObject   # The file-object for extractfile().
Guido van Rossumd8faa362007-04-27 19:54:29 +00001426
def __init__(self, name=None, mode="r", fileobj=None, format=None,
        tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
        errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
    """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
       read from an existing archive, 'a' to append data to an existing
       file or 'w' to create a new file overwriting an existing one. `mode'
       defaults to 'r'.
       If `fileobj' is given, it is used for reading or writing data. If it
       has a `mode' attribute, that value replaces the low-level open mode
       kept in `self._mode'. `fileobj' is not closed, when TarFile is closed.

       `format', `tarinfo', `dereference', `ignore_zeros', `encoding',
       `debug' and `errorlevel' override the class attribute of the same
       name when they are not None; `errors' is always stored.
       `pax_headers' is only kept if the resulting format is PAX_FORMAT.

       Raises ValueError for an unknown `mode'. In append mode a
       HeaderError met while scanning the archive is re-raised as
       ReadError.
    """
    # Map the public one-letter mode to the mode used for bltn_open().
    modes = {"r": "rb", "a": "r+b", "w": "wb"}
    if mode not in modes:
        raise ValueError("mode must be 'r', 'a' or 'w'")
    self.mode = mode
    self._mode = modes[mode]

    if not fileobj:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        # We opened the file ourselves, so close() has to close it.
        self._extfileobj = False
    else:
        if name is None and hasattr(fileobj, "name"):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        # The caller owns this file object; close() leaves it open.
        self._extfileobj = True
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Init attributes.
    if format is not None:
        self.format = format
    if tarinfo is not None:
        self.tarinfo = tarinfo
    if dereference is not None:
        self.dereference = dereference
    if ignore_zeros is not None:
        self.ignore_zeros = ignore_zeros
    if encoding is not None:
        self.encoding = encoding
    self.errors = errors

    # Global pax headers are only meaningful for the pax format.
    if pax_headers is not None and self.format == PAX_FORMAT:
        self.pax_headers = pax_headers
    else:
        self.pax_headers = {}

    if debug is not None:
        self.debug = debug
    if errorlevel is not None:
        self.errorlevel = errorlevel

    # Init datastructures.
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            # firstmember is set to None before next() is called so the
            # attribute already exists while next() runs.
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    # NOTE(review): fromtarfile() presumably advances
                    # self.offset past the member it read -- confirm.
                    tarinfo = self.tarinfo.fromtarfile(self)
                    self.members.append(tarinfo)
                except EOFHeaderError:
                    # Rewind so new members overwrite the end marker.
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e))

        if self.mode in "aw":
            # Nothing is left to scan when writing or appending.
            self._loaded = True

            if self.pax_headers:
                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    except:
        # Whatever went wrong, do not leak a file we opened ourselves,
        # then let the original exception propagate.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001523
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001524 #--------------------------------------------------------------------------
1525 # Below are the classmethods which act as alternate constructors to the
1526 # TarFile class. The open() method is the only one that is needed for
1527 # public use; it is the "super"-constructor and is able to select an
1528 # adequate "sub"-constructor for a particular compression using the mapping
1529 # from OPEN_METH.
1530 #
1531 # This concept allows one to subclass TarFile without losing the comfort of
1532 # the super-constructor. A sub-constructor is registered and made available
1533 # by adding it to the mapping in OPEN_METH.
1534
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
       an appropriate TarFile class.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'r:xz'       open for reading with lzma compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression
       'w:xz'       open for writing with lzma compression

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'r|xz'       open an lzma compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing
       'w|xz'       open an lzma compressed stream for writing

       Raises ValueError when neither `name' nor `fileobj' is given or
       the mode cannot be interpreted, CompressionError for an unknown
       compression type and ReadError when no registered opener accepts
       the file in transparent read mode.
    """
    if not name and not fileobj:
        raise ValueError("nothing to open")

    # Transparent reading: try each registered opener in turn.
    if mode in ("r", "r:*"):
        for opener_name in cls.OPEN_METH.values():
            opener = getattr(cls, opener_name)
            if fileobj is not None:
                start = fileobj.tell()
            try:
                return opener(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Rewind a caller-supplied file for the next attempt.
                if fileobj is not None:
                    fileobj.seek(start)
        raise ReadError("file could not be opened successfully")

    # Random-access file with an explicit compression type.
    if ":" in mode:
        filemode, _, comptype = mode.partition(":")
        filemode = filemode or "r"
        comptype = comptype or "tar"

        if comptype not in cls.OPEN_METH:
            raise CompressionError("unknown compression type %r" % comptype)
        opener = getattr(cls, cls.OPEN_METH[comptype])
        return opener(name, filemode, fileobj, **kwargs)

    # Non-seekable stream of tar blocks.
    if "|" in mode:
        filemode, _, comptype = mode.partition("|")
        filemode = filemode or "r"
        comptype = comptype or "tar"

        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            archive = cls(name, filemode, stream, **kwargs)
        except:
            stream.close()
            raise
        # The stream wrapper was created here, so the archive owns it.
        archive._extfileobj = False
        return archive

    if mode in ("a", "w"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001614
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open the uncompressed tar archive `name' for reading, appending
       or writing by instantiating `cls'. Any other `mode' raises
       ValueError.
    """
    if mode in ("r", "a", "w"):
        return cls(name, mode, fileobj, **kwargs)
    raise ValueError("mode must be 'r', 'a' or 'w'")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001622
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open the gzip compressed tar archive `name' for reading or
       writing. Appending is not allowed.

       Raises ValueError for any mode other than 'r' or 'w',
       CompressionError if the gzip module cannot be used and, in read
       mode, ReadError when the data turns out not to be gzip.
    """
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    reading = mode == 'r'

    try:
        fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
    except OSError:
        if reading and fileobj is not None:
            raise ReadError("not a gzip file")
        raise

    try:
        archive = cls.taropen(name, mode, fileobj, **kwargs)
    except BaseException as exc:
        # Never leak the GzipFile wrapper, whatever went wrong.
        fileobj.close()
        if reading and isinstance(exc, OSError):
            raise ReadError("not a gzip file")
        raise
    # The wrapper was created here, so the archive must close it.
    archive._extfileobj = False
    return archive
1656
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open the bzip2 compressed tar archive `name' for reading or
       writing. Appending is not allowed.

       Raises ValueError for any mode other than 'r' or 'w',
       CompressionError if the bz2 module is missing and, in read mode,
       ReadError when the data turns out not to be bzip2.
    """
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    source = fileobj or name
    fileobj = bz2.BZ2File(source, mode, compresslevel=compresslevel)

    try:
        archive = cls.taropen(name, mode, fileobj, **kwargs)
    except BaseException as exc:
        # Never leak the BZ2File wrapper, whatever went wrong.
        fileobj.close()
        if mode == 'r' and isinstance(exc, (IOError, EOFError)):
            raise ReadError("not a bzip2 file")
        raise
    # The wrapper was created here, so the archive must close it.
    archive._extfileobj = False
    return archive
1685
@classmethod
def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
    """Open the lzma compressed tar archive `name' for reading or
       writing. Appending is not allowed.

       Raises ValueError for any mode other than 'r' or 'w',
       CompressionError if the lzma module is missing and, in read mode,
       ReadError when the data turns out not to be lzma.
    """
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import lzma
    except ImportError:
        raise CompressionError("lzma module is not available")

    source = fileobj or name
    fileobj = lzma.LZMAFile(source, mode, preset=preset)

    try:
        archive = cls.taropen(name, mode, fileobj, **kwargs)
    except BaseException as exc:
        # Never leak the LZMAFile wrapper, whatever went wrong.
        fileobj.close()
        if mode == 'r' and isinstance(exc, (lzma.LZMAError, EOFError)):
            raise ReadError("not an lzma file")
        raise
    # The wrapper was created here, so the archive must close it.
    archive._extfileobj = False
    return archive
1713
# All *open() methods are registered here. Keys are the compression
# names accepted in a mode string (e.g. 'r:gz'); values are the names of
# the classmethods that open() looks up with getattr().
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz":  "gzopen",    # gzip compressed tar
    "bz2": "bz2open",   # bzip2 compressed tar
    "xz":  "xzopen"     # lzma compressed tar
}
1721
1722 #--------------------------------------------------------------------------
1723 # The public methods which TarFile provides:
1724
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive and the archive is padded with zeros up
       to a multiple of RECORDSIZE.

       Calling close() on an already closed TarFile does nothing. The
       archive is marked closed and a file object opened by TarFile
       itself is closed even if writing the trailer fails; the write
       error still propagates to the caller.
    """
    if self.closed:
        return

    # Flag the archive first so that a failed close is not retried on a
    # half-written trailer.
    self.closed = True
    try:
        if self.mode in "aw":
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            remainder = self.offset % RECORDSIZE
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
    finally:
        # Release our own file object no matter what happened above;
        # an externally supplied one stays open.
        if not self._extfileobj:
            self.fileobj.close()
1744
def getmember(self, name):
    """Return the TarInfo object for member `name'. KeyError is raised
       if `name' can not be found in the archive. If a member occurs
       more than once in the archive, its last occurrence is assumed to
       be the most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001755
def getmembers(self):
    """Return the archive's members as a list of TarInfo objects, in
       the same order as they appear in the archive.
    """
    self._check()
    # A complete list requires that the whole archive has been scanned.
    already_scanned = self._loaded
    if not already_scanned:
        self._load()
    return self.members
1765
def getnames(self):
    """Return the names of the archive's members as a list, in the
       same order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001771
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object for either the file `name' or the file
       object `fileobj' (using os.fstat on its file descriptor). You can
       modify some of the TarInfo's attributes before you add it using
       addfile(). If given, `arcname' specifies an alternative name for the
       file in the archive.

       Returns None if the file is of none of the handled kinds (regular
       file, directory, fifo, symbolic link, character or block device).
       The archive must be open for writing or appending.
    """
    self._check("aw")

    # When fileobj is given, replace name by
    # fileobj's real name.
    if fileobj is not None:
        name = fileobj.name

    # Building the name of the member in the archive.
    # Backward slashes are converted to forward slashes,
    # Absolute paths are turned to relative paths.
    if arcname is None:
        arcname = name
    drv, arcname = os.path.splitdrive(arcname)
    arcname = arcname.replace(os.sep, "/")
    arcname = arcname.lstrip("/")

    # Now, fill the TarInfo object with
    # information specific for the file.
    tarinfo = self.tarinfo()
    tarinfo.tarfile = self

    # Use os.stat or os.lstat, depending on platform
    # and if symlinks shall be resolved.
    if fileobj is None:
        if hasattr(os, "lstat") and not self.dereference:
            statres = os.lstat(name)
        else:
            statres = os.stat(name)
    else:
        statres = os.fstat(fileobj.fileno())
    linkname = ""

    stmd = statres.st_mode
    if stat.S_ISREG(stmd):
        # self.inodes maps (st_ino, st_dev) to the archive name under
        # which that inode was first stored.
        inode = (statres.st_ino, statres.st_dev)
        if not self.dereference and statres.st_nlink > 1 and \
                inode in self.inodes and arcname != self.inodes[inode]:
            # Is it a hardlink to an already
            # archived file?
            type = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # The inode is added only if its valid.
            # For win32 it is always 0.
            type = REGTYPE
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(stmd):
        type = DIRTYPE
    elif stat.S_ISFIFO(stmd):
        type = FIFOTYPE
    elif stat.S_ISLNK(stmd):
        type = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(stmd):
        type = CHRTYPE
    elif stat.S_ISBLK(stmd):
        type = BLKTYPE
    else:
        # None of the file kinds tested above.
        return None

    # Fill the TarInfo object with all
    # information we can get.
    tarinfo.name = arcname
    tarinfo.mode = stmd
    tarinfo.uid = statres.st_uid
    tarinfo.gid = statres.st_gid
    # Only members stored as regular files get a size; everything else,
    # including hardlinks detected above, is recorded with size 0.
    if type == REGTYPE:
        tarinfo.size = statres.st_size
    else:
        tarinfo.size = 0
    tarinfo.mtime = statres.st_mtime
    tarinfo.type = type
    tarinfo.linkname = linkname
    # `pwd' and `grp' may be false values -- presumably when the
    # modules are unavailable on the platform; confirm at the imports.
    if pwd:
        try:
            tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
        except KeyError:
            # No passwd entry for this uid; keep the default uname.
            pass
    if grp:
        try:
            tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
        except KeyError:
            # No group entry for this gid; keep the default gname.
            pass

    if type in (CHRTYPE, BLKTYPE):
        if hasattr(os, "major") and hasattr(os, "minor"):
            tarinfo.devmajor = os.major(statres.st_rdev)
            tarinfo.devminor = os.minor(statres.st_rdev)
    return tarinfo
1869
def list(self, verbose=True):
    """Print a table of contents to sys.stdout. If `verbose' is False,
       only the names of the members are printed. If it is True, an
       `ls -l'-like output is produced.
    """
    self._check()

    for member in self:
        if verbose:
            _safe_print(stat.filemode(member.mode))
            owner = member.uname or member.uid
            group = member.gname or member.gid
            _safe_print("%s/%s" % (owner, group))
            if member.ischr() or member.isblk():
                # Devices show "major,minor" in place of a size.
                device = "%d,%d" % (member.devmajor, member.devminor)
                _safe_print("%10s" % device)
            else:
                _safe_print("%10d" % member.size)
            stamp = time.localtime(member.mtime)[:6]
            _safe_print("%d-%02d-%02d %02d:%02d:%02d" % stamp)

        suffix = "/" if member.isdir() else ""
        _safe_print(member.name + suffix)

        if verbose:
            if member.issym():
                _safe_print("-> " + member.linkname)
            if member.islnk():
                _safe_print("link to " + member.linkname)
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001898
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of
       file (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided
       by setting `recursive' to False. `exclude' is a deprecated
       function that should return True for each filename to be
       excluded. `filter' is a function that expects a TarInfo object
       argument and returns the changed TarInfo object; if it returns
       None the TarInfo object will be excluded from the archive.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Deprecated name-based exclusion.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                      DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Never archive the archive into itself.
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Describe the file on disk.
    member = self.gettarinfo(name, arcname)
    if member is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Let the caller change or drop the member.
    if filter is not None:
        member = filter(member)
        if member is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Regular files carry data; everything else is header-only.
    if member.isreg():
        with bltn_open(name, "rb") as source:
            self.addfile(member, source)
        return

    is_directory = member.isdir()
    self.addfile(member)
    if is_directory and recursive:
        for entry in os.listdir(name):
            self.add(os.path.join(name, entry), os.path.join(arcname, entry),
                     recursive, exclude, filter=filter)
1959
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, tarinfo.size bytes are read from it and added to the
       archive. You can create TarInfo objects using gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.
    """
    self._check("aw")

    # Work on a private copy; the caller's object is left alone.
    member = copy.copy(tarinfo)

    header = member.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset += len(header)

    # Member data, zero-padded to a whole number of blocks.
    if fileobj is not None:
        copyfileobj(fileobj, self.fileobj, member.size)
        whole_blocks, leftover = divmod(member.size, BLOCKSIZE)
        if leftover > 0:
            self.fileobj.write(NUL * (BLOCKSIZE - leftover))
            whole_blocks += 1
        self.offset += whole_blocks * BLOCKSIZE

    self.members.append(member)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001985
def extractall(self, path=".", members=None):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers().
    """
    postponed = []
    source = self if members is None else members

    for member in source:
        if member.isdir():
            # Remember the real entry, but create the directory from a
            # copy with a safe mode.
            postponed.append(member)
            member = copy.copy(member)
            member.mode = 0o700
        # Directory attributes are applied further down instead.
        self.extract(member, path, set_attrs=not member.isdir())

    # Reverse sort directories.
    postponed.sort(key=lambda entry: entry.name)
    postponed.reverse()

    # Now apply the real owner, mtime and mode to each directory.
    for member in postponed:
        target = os.path.join(path, member.name)
        try:
            self.chown(member, target)
            self.utime(member, target)
            self.chmod(member, target)
        except ExtractError as err:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % err)
2023
def extract(self, member, path="", set_attrs=True):
    """Extract a member from the archive to the current working
       directory, using its full name. Its file information is extracted
       as accurately as possible. `member' may be a filename or a
       TarInfo object. You can specify a different directory using
       `path'. File attributes (owner, mtime, mode) are set unless
       `set_attrs' is False.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Prepare the link target for makelink().
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    try:
        self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                             set_attrs=set_attrs)
    except EnvironmentError as err:
        if self.errorlevel > 0:
            raise
        # Low error levels only report the problem as a debug message.
        if err.filename is None:
            message = "tarfile: %s" % err.strerror
        else:
            message = "tarfile: %s %r" % (err.strerror, err.filename)
        self._dbg(1, message)
    except ExtractError as err:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % err)
2058
def extractfile(self, member):
    """Return a file object for a member of the archive. `member' may be
       a filename or a TarInfo object. Regular files and members whose
       type is not in SUPPORTED_TYPES yield self.fileobject(self, tarinfo);
       a (sym)link yields the file object of its target. For every other
       member None is returned.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Members with unknown types are treated as regular files.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    # No data is associated with the member (directory, chrdev, blkdev,
    # etc.), so there is nothing to wrap in a file object.
    if not (tarinfo.islnk() or tarinfo.issym()):
        return None

    # A (sym)link cannot be followed when the archive is read from a
    # non-seekable stream of tar blocks.
    if isinstance(self.fileobj, _Stream):
        raise StreamError("cannot extract (sym)link as file object")

    # A (sym)link's file object is its target's file object.
    return self.extractfile(self._find_link_target(tarinfo))
2089
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002090 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002091 """Extract the TarInfo object tarinfo to a physical
2092 file called targetpath.
2093 """
2094 # Fetch the TarInfo object for the given name
2095 # and build the destination pathname, replacing
2096 # forward slashes to platform specific separators.
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +00002097 targetpath = targetpath.rstrip("/")
2098 targetpath = targetpath.replace("/", os.sep)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002099
2100 # Create all upper directories.
2101 upperdirs = os.path.dirname(targetpath)
2102 if upperdirs and not os.path.exists(upperdirs):
Christian Heimes2202f872008-02-06 14:31:34 +00002103 # Create directories that are not part of the archive with
2104 # default permissions.
Thomas Woutersb2137042007-02-01 18:02:27 +00002105 os.makedirs(upperdirs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002106
2107 if tarinfo.islnk() or tarinfo.issym():
2108 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2109 else:
2110 self._dbg(1, tarinfo.name)
2111
2112 if tarinfo.isreg():
2113 self.makefile(tarinfo, targetpath)
2114 elif tarinfo.isdir():
2115 self.makedir(tarinfo, targetpath)
2116 elif tarinfo.isfifo():
2117 self.makefifo(tarinfo, targetpath)
2118 elif tarinfo.ischr() or tarinfo.isblk():
2119 self.makedev(tarinfo, targetpath)
2120 elif tarinfo.islnk() or tarinfo.issym():
2121 self.makelink(tarinfo, targetpath)
2122 elif tarinfo.type not in SUPPORTED_TYPES:
2123 self.makeunknown(tarinfo, targetpath)
2124 else:
2125 self.makefile(tarinfo, targetpath)
2126
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002127 if set_attrs:
2128 self.chown(tarinfo, targetpath)
2129 if not tarinfo.issym():
2130 self.chmod(tarinfo, targetpath)
2131 self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002132
2133 #--------------------------------------------------------------------------
2134 # Below are the different file methods. They are called via
2135 # _extract_member() when extract() is called. They can be replaced in a
2136 # subclass to implement other functionality.
2137
def makedir(self, tarinfo, targetpath):
    """Create the directory targetpath. An already existing targetpath
       is silently accepted.
    """
    # Start out with a safe mode; the real mode is set later in
    # _extract_member().
    safe_mode = 0o700
    try:
        os.mkdir(targetpath, safe_mode)
    except FileExistsError:
        pass
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002147
def makefile(self, tarinfo, targetpath):
    """Create the file targetpath and fill it with the member's data,
       read from self.fileobj starting at tarinfo.offset_data.
    """
    archive = self.fileobj
    archive.seek(tarinfo.offset_data)
    with bltn_open(targetpath, "wb") as out:
        segments = tarinfo.sparse
        if segments is None:
            copyfileobj(archive, out, tarinfo.size)
        else:
            # Each (offset, size) pair names a stretch of data; seek to
            # its offset in the output before copying it.
            for position, length in segments:
                out.seek(position)
                copyfileobj(archive, out, length)
        # Cut or extend the output to exactly tarinfo.size bytes.
        out.seek(tarinfo.size)
        out.truncate()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002162
def makeunknown(self, tarinfo, targetpath):
    """Create targetpath from a TarInfo object with an unknown type. The
       member is written like a regular file and a note is emitted
       through _dbg() at level 1.
    """
    self.makefile(tarinfo, targetpath)
    note = ("tarfile: Unknown file type %r, "
            "extracted as regular file." % tarinfo.type)
    self._dbg(1, note)
2170
def makefifo(self, tarinfo, targetpath):
    """Create a fifo called targetpath. Raise ExtractError if the os
       module offers no mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002178
def makedev(self, tarinfo, targetpath):
    """Create a character or block device called targetpath. Raise
       ExtractError if the os module lacks mknod() or makedev().
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the permission bits with the matching device type flag.
    mode = tarinfo.mode
    mode |= stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR

    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, mode, device)
2193
def makelink(self, tarinfo, targetpath):
    """Create a symbolic or hard link called targetpath. If the link
       cannot be created (platform limitation), a copy of the referenced
       member is extracted in its place.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        elif os.path.exists(tarinfo._link_target):
            # _link_target is prepared by extract().
            os.link(tarinfo._link_target, targetpath)
        else:
            # The hard link's target is not on disk; extract the member
            # it refers to under the link's name.
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
    except symlink_exception:
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002216
def chown(self, tarinfo, targetpath):
    """Set the owner of targetpath according to tarinfo. Nothing happens
       unless pwd is available and the effective uid is 0. Raise
       ExtractError if the owner cannot be changed.
    """
    # We have to be root to change the owner.
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        return

    # Look the ids up by name first and fall back to the numeric ids
    # stored in tarinfo when the name is unknown.
    try:
        gid = grp.getgrnam(tarinfo.gname)[2]
    except KeyError:
        gid = tarinfo.gid
    try:
        uid = pwd.getpwnam(tarinfo.uname)[2]
    except KeyError:
        uid = tarinfo.uid

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, uid, gid)
        elif sys.platform != "os2emx":
            os.chown(targetpath, uid, gid)
    except EnvironmentError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002238
def chmod(self, tarinfo, targetpath):
    """Set the file permissions of targetpath to tarinfo.mode. Nothing
       happens if the os module has no chmod(). Raise ExtractError if
       the mode cannot be changed.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except EnvironmentError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002247
def utime(self, tarinfo, targetpath):
    """Set the access and modification time of targetpath to
       tarinfo.mtime. Nothing happens if the os module has no utime().
       Raise ExtractError if the time cannot be changed.
    """
    if hasattr(os, 'utime'):
        stamp = tarinfo.mtime
        try:
            os.utime(targetpath, (stamp, stamp))
        except EnvironmentError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002257
2258 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, when
       TarFile is opened for reading. Return None if there is no more
       available.
    """
    self._check("ra")

    # A member that was read ahead is handed out first.
    pending = self.firstmember
    if pending is not None:
        self.firstmember = None
        return pending

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as exc:
            if self.ignore_zeros:
                # Skip this block and try the following one.
                self._dbg(2, "0x%X: %s" % (self.offset, exc))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as exc:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, exc))
                self.offset += BLOCKSIZE
                continue
            elif self.offset == 0:
                raise ReadError(str(exc))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as exc:
            if self.offset == 0:
                raise ReadError(str(exc))
        except SubsequentHeaderError as exc:
            raise ReadError(str(exc))
        # Either a header was read or the error ends the archive here.
        break

    if tarinfo is None:
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002304
2305 #--------------------------------------------------------------------------
2306 # Little helper methods:
2307
Lars Gustäbel1b512722010-06-03 12:45:16 +00002308 def _getmember(self, name, tarinfo=None, normalize=False):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002309 """Find an archive member by name from bottom to top.
2310 If tarinfo is given, it is used as the starting point.
2311 """
Martin v. Löwisf3c56112004-09-18 09:08:52 +00002312 # Ensure that all members have been loaded.
2313 members = self.getmembers()
2314
Lars Gustäbel1b512722010-06-03 12:45:16 +00002315 # Limit the member search list up to tarinfo.
2316 if tarinfo is not None:
2317 members = members[:members.index(tarinfo)]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002318
Lars Gustäbel1b512722010-06-03 12:45:16 +00002319 if normalize:
2320 name = os.path.normpath(name)
2321
2322 for member in reversed(members):
2323 if normalize:
2324 member_name = os.path.normpath(member.name)
2325 else:
2326 member_name = member.name
2327
2328 if name == member_name:
2329 return member
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002330
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002331 def _load(self):
2332 """Read through the entire archive file and look for readable
2333 members.
2334 """
2335 while True:
2336 tarinfo = self.next()
2337 if tarinfo is None:
2338 break
2339 self._loaded = True
2340
2341 def _check(self, mode=None):
2342 """Check if TarFile is still open, and if the operation's mode
2343 corresponds to TarFile's mode.
2344 """
2345 if self.closed:
Thomas Wouters477c8d52006-05-27 19:21:47 +00002346 raise IOError("%s is closed" % self.__class__.__name__)
Guido van Rossumd8faa362007-04-27 19:54:29 +00002347 if mode is not None and self.mode not in mode:
2348 raise IOError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002349
Lars Gustäbel1b512722010-06-03 12:45:16 +00002350 def _find_link_target(self, tarinfo):
2351 """Find the target member of a symlink or hardlink member in the
2352 archive.
2353 """
2354 if tarinfo.issym():
2355 # Always search the entire archive.
Lars Gustäbel1ef9eda2012-04-24 21:04:40 +02002356 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
Lars Gustäbel1b512722010-06-03 12:45:16 +00002357 limit = None
2358 else:
2359 # Search the archive before the link, because a hard link is
2360 # just a reference to an already archived file.
2361 linkname = tarinfo.linkname
2362 limit = tarinfo
2363
2364 member = self._getmember(linkname, tarinfo=limit, normalize=True)
2365 if member is None:
2366 raise KeyError("linkname %r not found" % linkname)
2367 return member
2368
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002369 def __iter__(self):
2370 """Provide an iterator object.
2371 """
2372 if self._loaded:
2373 return iter(self.members)
2374 else:
2375 return TarIter(self)
2376
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002377 def _dbg(self, level, msg):
2378 """Write debugging output to sys.stderr.
2379 """
2380 if level <= self.debug:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00002381 print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002382
2383 def __enter__(self):
2384 self._check()
2385 return self
2386
2387 def __exit__(self, type, value, traceback):
2388 if type is None:
2389 self.close()
2390 else:
2391 # An exception occurred. We must not call close() because
2392 # it would try to write end-of-archive blocks and padding.
2393 if not self._extfileobj:
2394 self.fileobj.close()
2395 self.closed = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002396# class TarFile
2397
class TarIter:
    """Iterator over the members of a TarFile.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object that starts at the first member
           of tarfile.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return the iterator object itself.
        """
        return self

    def __next__(self):
        """Return the next member, reading it with TarFile's next()
           method if it is not yet in the members list. When all members
           have been read, set TarFile as _loaded.
        """
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely.
        archive = self.tarfile
        position = self.index

        if position == 0 and archive.firstmember is not None:
            tarinfo = archive.next()
        elif position < len(archive.members):
            # The member has already been read; serve it from the list.
            tarinfo = archive.members[position]
        elif archive._loaded:
            raise StopIteration
        else:
            tarinfo = archive.next()
            if not tarinfo:
                archive._loaded = True
                raise StopIteration

        self.index = position + 1
        return tarinfo
2435
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002436#--------------------
2437# exported functions
2438#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.
    """
    # The archive is opened only as a probe and closed right away; a
    # TarError from either step means it cannot be handled.
    try:
        open(name).close()
    except TarError:
        return False
    return True
2449
# Keep the previous binding of open reachable as bltn_open, because the
# next line rebinds the module-level name `open' to TarFile.open.
bltn_open = open
open = TarFile.open