blob: e273787695e2cf8f037147b2ee3a56e15e16c2e9 [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
Guido van Rossumd8faa362007-04-27 19:54:29 +000032version = "0.9.0"
Guido van Rossum98297ee2007-11-06 21:34:58 +000033__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
Senthil Kumaran7c9719c2011-07-28 22:32:49 +080034__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
35__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
Guido van Rossum98297ee2007-11-06 21:34:58 +000036__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000037
38#---------
39# Imports
40#---------
41import sys
42import os
Eli Bendersky74c503b2012-01-03 06:26:13 +020043import io
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000044import shutil
45import stat
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000046import time
47import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000048import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000049import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000050
51try:
52 import grp, pwd
53except ImportError:
54 grp = pwd = None
55
# Exceptions that signal "symlinks cannot be created here".
# os.symlink on Windows prior to 6.0 raises NotImplementedError, and a
# WindowsError (1314) is raised if the caller does not hold the
# SeCreateSymbolicLinkPrivilege privilege.  WindowsError is not defined
# on every platform, hence the NameError fallback.
try:
    symlink_exception = (AttributeError, NotImplementedError, WindowsError)
except NameError:
    symlink_exception = (AttributeError, NotImplementedError)
64
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000065# from tarfile import *
66__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
67
Georg Brandl1a3284e2007-12-02 09:40:06 +000068from builtins import open as _open # Since 'open' is TarFile.open
Guido van Rossum8f78fe92006-08-24 04:03:53 +000069
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000070#---------------------------------------------------------
71# tar constants
72#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
# Both magic strings cover the 6-byte magic field plus the 2-byte version
# field of the header, so each of them must be exactly 8 bytes long.
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string (note: TWO spaces)
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

# Values of the one-byte type field of a member header.
REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

# Archive formats that can be written.
USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000105
106#---------------------------------------------------------
107# tarfile constants
108#---------------------------------------------------------
# Member types this module knows how to handle.
SUPPORTED_TYPES = (
    REGTYPE, AREGTYPE, LNKTYPE,
    SYMTYPE, DIRTYPE, FIFOTYPE,
    CONTTYPE, CHRTYPE, BLKTYPE,
    GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
    GNUTYPE_SPARSE,
)

# Member types that are handled like a regular file.
REGULAR_TYPES = (
    REGTYPE, AREGTYPE,
    CONTTYPE, GNUTYPE_SPARSE,
)

# Member types that belong to the GNU tar format.
GNU_TYPES = (
    GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
    GNUTYPE_SPARSE,
)

# Pax header fields that override the TarInfo attribute of the same name.
PAX_FIELDS = (
    "path", "linkpath", "size", "mtime",
    "uid", "gid", "uname", "gname",
)

# Pax header fields that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Pax header fields that hold numbers, mapped to the type used to convert
# them; every field not listed here is treated as a string.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int,
}
141
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000142#---------------------------------------------------------
143# Bits used in the mode field, values in octal.
144#---------------------------------------------------------
# File type bits.
S_IFLNK = 0o120000        # symbolic link
S_IFREG = 0o100000        # regular file
S_IFBLK = 0o060000        # block device
S_IFDIR = 0o040000        # directory
S_IFCHR = 0o020000        # character device
S_IFIFO = 0o010000        # fifo

# Special permission bits.
TSUID = 0o4000            # set UID on execution
TSGID = 0o2000            # set GID on execution
TSVTX = 0o1000            # reserved

# Owner / group / other permission bits.
TUREAD = 0o400            # read by owner
TUWRITE = 0o200           # write by owner
TUEXEC = 0o100            # execute/search by owner
TGREAD = 0o040            # read by group
TGWRITE = 0o020           # write by group
TGEXEC = 0o010            # execute/search by group
TOREAD = 0o004            # read by other
TOWRITE = 0o002           # write by other
TOEXEC = 0o001            # execute/search by other
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000165
166#---------------------------------------------------------
Guido van Rossumd8faa362007-04-27 19:54:29 +0000167# initialization
168#---------------------------------------------------------
# Default encoding: fixed to UTF-8 when os.name is "nt" or "ce", otherwise
# whatever the interpreter reports as the filesystem encoding.
ENCODING = "utf-8" if os.name in ("nt", "ce") else sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000173
174#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000175# Some useful functions
176#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000177
def stn(s, length, encoding, errors):
    """Encode the string s into a bytes field of exactly length bytes.

    The encoded value is cut off after length bytes; a shorter value is
    filled up with NUL bytes.
    """
    encoded = s.encode(encoding, errors)
    padding = NUL * (length - len(encoded))
    return encoded[:length] + padding
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
def nts(s, encoding, errors):
    """Decode a bytes field into a string.

    Only the part in front of the first NUL byte is decoded; a field
    without any NUL byte is decoded as a whole.
    """
    head = s.partition(b"\0")[0]
    return head.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000191
def nti(s):
    """Convert a number field (bytes) to a python number.

    Raises InvalidHeaderError if the field is neither base-256 encoded
    nor a valid octal number.
    """
    # There are two possible encodings for a number field, see
    # itn() below.
    marker = s[0]
    if marker in (0o200, 0o377):
        # Base-256: the bytes after the marker are a big-endian number.
        n = int.from_bytes(s[1:], "big")
        if marker == 0o377:
            # A 0o377 marker denotes a negative number.
            n = -(256 ** (len(s) - 1) - n)
        return n

    try:
        # Octal digits up to the first NUL; an empty field counts as 0.
        return int(nts(s, "ascii", "strict") or "0", 8)
    except ValueError:
        raise InvalidHeaderError("invalid header")
210
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field of digits bytes.

    Raises ValueError if n does not fit into the field for the given
    format.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    payload_len = digits - 1
    if 0 <= n < 8 ** payload_len:
        return bytes("%0*o" % (payload_len, n), "ascii") + NUL

    limit = 256 ** payload_len
    if format != GNU_FORMAT or not (-limit <= n < limit):
        raise ValueError("overflow in number field")

    if n >= 0:
        marker = 0o200
    else:
        marker = 0o377
        n += 256 ** digits

    # Keep only the low digits-1 bytes, most significant byte first.
    field = bytearray([marker])
    field += (n & (limit - 1)).to_bytes(payload_len, "big")
    return field
238
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a member's header.

    All bytes of the 512 byte header are summed up except for the 8 byte
    chksum field (offsets 148-155), which counts as if it was filled
    with spaces (8 * 32 == 256). According to the GNU tar sources, some
    tars (Sun and NeXT) calculate chksum with signed char, which gives a
    different result if there are bytes with the high bit set, so both
    variants are returned.
    """
    header = buf[:512]
    # "8x" skips the chksum field itself.
    unsigned_chksum = 256 + sum(struct.unpack("148B8x356B", header))
    signed_chksum = 256 + sum(struct.unpack("148b8x356b", header))
    return unsigned_chksum, signed_chksum
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000251
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.

    If length is None, the entire content is copied; if it is 0, nothing
    is copied. Raises IOError if src ends before length bytes were read.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    BUFSIZE = 16 * 1024

    def transfer(count):
        # Move exactly count bytes; a short read means src is exhausted.
        data = src.read(count)
        if len(data) < count:
            raise IOError("end of file reached")
        dst.write(data)

    full_blocks, tail = divmod(length, BUFSIZE)
    for _ in range(full_blocks):
        transfer(BUFSIZE)
    if tail != 0:
        transfer(tail)
276
# One inner tuple per character of the mode string produced by
# filemode(). Within an inner tuple the (bits, char) pairs are tried in
# order, so combined bit patterns must come before their single bits.
filemode_table = (
    # file type
    ((S_IFLNK, "l"),
     (S_IFREG, "-"),
     (S_IFBLK, "b"),
     (S_IFDIR, "d"),
     (S_IFCHR, "c"),
     (S_IFIFO, "p")),

    # owner
    ((TUREAD, "r"),),
    ((TUWRITE, "w"),),
    ((TUEXEC | TSUID, "s"),
     (TSUID, "S"),
     (TUEXEC, "x")),

    # group
    ((TGREAD, "r"),),
    ((TGWRITE, "w"),),
    ((TGEXEC | TSGID, "s"),
     (TSGID, "S"),
     (TGEXEC, "x")),

    # other
    ((TOREAD, "r"),),
    ((TOWRITE, "w"),),
    ((TOEXEC | TSVTX, "t"),
     (TSVTX, "T"),
     (TOEXEC, "x")),
)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000303
def filemode(mode):
    """Convert a file's mode to a string of the form
       -rwxrwxrwx.
       Used by TarFile.list()
    """
    # For every position take the character of the first entry whose
    # bits are all set in mode, or "-" if no entry matches.
    return "".join(
        next((char for bit, char in table if mode & bit == bit), "-")
        for table in filemode_table
    )
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000318
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000352
353#---------------------------
354# internal stream interface
355#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.
    """

    def __init__(self, name, mode):
        """Open the file name with os.open().

        mode is "r" (read-only) or "w" (write-only, create and
        truncate); any other value raises KeyError.
        """
        flags = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        # O_BINARY is only added on platforms whose os module defines it.
        if hasattr(os, "O_BINARY"):
            flags |= os.O_BINARY
        self.fd = os.open(name, flags, 0o666)

    def close(self):
        """Close the underlying file descriptor."""
        os.close(self.fd)

    def read(self, size):
        """Return up to size bytes read from the file descriptor."""
        return os.read(self.fd, size)

    def write(self, s):
        """Write all of the bytes-like object s to the file descriptor."""
        # os.write() may write fewer bytes than requested and reports how
        # many it actually wrote, so keep going until nothing is left
        # instead of silently dropping the rest.
        view = memoryview(s).cast("B")
        while view:
            written = os.write(self.fd, view)
            view = view[written:]
379
class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip, bzip2 or xz compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.

        name     -- file name; opened with _LowLevelFile if fileobj is
                    None, and stored in the gzip header when writing.
        mode     -- "r" for reading or "w" for writing.
        comptype -- "tar" (no compression), "gz", "bz2", "xz" or "*"
                    (detect the compression from the stream's first block).
        fileobj  -- stream-like object to use, or None to open name.
        bufsize  -- number of bytes per access to the underlying stream.

        Raises CompressionError if comptype is unknown or the required
        compression module is not available.
        """
        # Only a file object that was opened here is closed by close().
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""          # raw bytes not yet written / consumed
        self.pos = 0            # position in the uncompressed stream
        self.closed = False

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                if mode == "r":
                    self._init_read_gz()
                    self.exception = zlib.error
                else:
                    self._init_write_gz()

            elif comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = IOError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == "xz":
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype != "tar":
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # Deliberately catches everything: release the file that was
            # opened above, then let the original exception propagate.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # "closed" does not exist if __init__ failed very early.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        # Negative wbits: raw deflate data, the gzip header and trailer
        # are written by this class itself.
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)
        timestamp = struct.pack("<L", int(time.time()))
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write the bytes s to the stream, compressing them if
           necessary.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Buffer the bytes s and write out blocks of bufsize bytes
           for as long as the buffer holds more than bufsize bytes.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        # Mark the stream closed first so that a failure below is not
        # retried by __del__().
        self.closed = True
        try:
            if self.mode == "w" and self.comptype != "tar":
                self.buf += self.cmp.flush()

            if self.mode == "w" and self.buf:
                self.fileobj.write(self.buf)
                self.buf = b""
                if self.comptype == "gz":
                    # The native zlib crc is an unsigned 32-bit integer, but
                    # the Python wrapper implicitly casts that to a signed C
                    # long.  So, on a 32-bit box self.crc may "look negative",
                    # while the same crc on a 64-bit box may "look positive".
                    # To avoid irksome warnings from the `struct` module, force
                    # it to look positive on all boxes.
                    self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
        finally:
            # Release a file opened by __init__() even if flushing or
            # writing failed.
            if not self._extfileobj:
                self.fileobj.close()

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Raises ReadError if the stream does not start with the gzip
        magic and CompressionError if the compression method is not
        deflate.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)          # skip mtime, extra flags and OS byte

        if flag & 4:
            # FEXTRA: the extra field is part of the uncompressed gzip
            # header, so it has to be skipped with the raw __read().
            # Using read() here would feed it to the decompressor and
            # advance self.pos.
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.__read(xlen)
        if flag & 8:
            # FNAME: NUL-terminated original file name.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            # FCOMMENT: NUL-terminated comment.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            # FHCRC: two byte header checksum.
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.

        Seeking forward is done by reading and discarding data. Raises
        StreamError if pos lies before the current position.
        """
        if pos - self.pos >= 0:
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            # The chunks are bytes objects, so the separator has to be
            # bytes as well.
            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes of uncompressed data from the stream.

        Raises ReadError if the compressed data is invalid.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception:
                raise ReadError("invalid compressed data")
            self.dbuf += buf
            c += len(buf)
        buf = self.dbuf[:size]
        self.dbuf = self.dbuf[size:]
        return buf

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            self.buf += buf
            c += len(buf)
        buf = self.buf[:size]
        self.buf = self.buf[size:]
        return buf
# class _Stream
629
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').

       The first BLOCKSIZE bytes of the wrapped file object are read
       ahead so that getcomptype() can inspect them.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        """Return the read-ahead block; size is ignored on this call."""
        # Shadow this method on the instance so that every later read()
        # goes straight to the wrapped file object.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Return "gz", "bz2", "xz" or "tar" depending on the magic
           bytes at the start of the read-ahead block.
        """
        head = self.buf
        if head.startswith(b"\x1f\x8b\x08"):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        """Close the wrapped file object."""
        self.fileobj.close()
# class StreamProxy
656
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000657#------------------------
658# Extraction file object
659#------------------------
class _FileInFile(object):
    """Read-only, file-like view onto a part of an existing file
       object, presenting that part as an individual file.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        """Create the view.

        fileobj   -- file object that holds the data.
        offset    -- position in fileobj where the data starts.
        size      -- size of the file as seen through this object.
        blockinfo -- optional list of (offset, size) pairs naming the
                     parts of the file that are stored in fileobj;
                     everything else reads as NUL bytes. Defaults to one
                     block that covers the whole file.
        """
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks. Every entry is
        # (is_data, start, stop, position in fileobj or None).
        self.map_index = 0
        self.map = []
        covered = 0
        stored_at = self.offset
        for start, length in blockinfo:
            stop = start + length
            if start > covered:
                self.map.append((False, covered, start, None))
            self.map.append((True, start, stop, stored_at))
            stored_at += length
            covered = stop
        if covered < self.size:
            self.map.append((False, covered, self.size, None))

    def flush(self):
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file and return the new position.

        Raises ValueError for an unknown whence value.
        """
        if whence == io.SEEK_SET:
            target = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            target = self.position + position
            if position < 0:
                target = max(target, 0)
            else:
                target = min(target, self.size)
        elif whence == io.SEEK_END:
            target = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        self.position = target
        return self.position

    def read(self, size=None):
        """Read up to size bytes from the file, or everything up to the
           end of the file if size is None.
        """
        remaining = self.size - self.position
        if size is not None:
            remaining = min(size, remaining)

        chunks = []
        while remaining > 0:
            # Find the map entry that contains the current position,
            # starting at the entry used last and wrapping around.
            while True:
                is_data, start, stop, stored_at = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                self.map_index += 1
                if self.map_index == len(self.map):
                    self.map_index = 0
            length = min(remaining, stop - self.position)
            if is_data:
                self.fileobj.seek(stored_at + (self.position - start))
                chunks.append(self.fileobj.read(length))
            else:
                chunks.append(NUL * length)
            remaining -= length
            self.position += length
        return b"".join(chunks)

    def readinto(self, b):
        """Read into the buffer b and return the number of bytes read."""
        data = self.read(len(b))
        count = len(data)
        b[:count] = data
        return count

    def close(self):
        self.closed = True
#class _FileInFile
Martin v. Löwisdf241532005-03-03 08:17:42 +0000760
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000761
762#------------------
763# Exported Classes
764#------------------
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")

    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member.
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information

    # In pax headers the "name" and "linkname" field are called
    # "path" and "linkpath". The properties below are plain aliases.
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))

    def get_info(self):
        """Return the TarInfo's attributes as a dictionary.

           The mode is masked with 0o7777 and the name of a directory
           member always ends with a slash.
        """
        info = {
            "name":     self.name,
            "mode":     self.mode & 0o7777,
            "uid":      self.uid,
            "gid":      self.gid,
            "size":     self.size,
            "mtime":    self.mtime,
            "chksum":   self.chksum,
            "type":     self.type,
            "linkname": self.linkname,
            "uname":    self.uname,
            "gname":    self.gname,
            "devmajor": self.devmajor,
            "devminor": self.devminor
        }

        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
            info["name"] += "/"

        return info

    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
        """Return a tar header as a string of 512 byte blocks.

           format must be USTAR_FORMAT, GNU_FORMAT or PAX_FORMAT,
           otherwise ValueError is raised. errors is not used for
           PAX_FORMAT.
        """
        info = self.get_info()

        if format == USTAR_FORMAT:
            return self.create_ustar_header(info, encoding, errors)
        elif format == GNU_FORMAT:
            return self.create_gnu_header(info, encoding, errors)
        elif format == PAX_FORMAT:
            return self.create_pax_header(info, encoding)
        else:
            raise ValueError("invalid format")

    def create_ustar_header(self, info, encoding, errors):
        """Return the object as a ustar header block.

           Raise ValueError if the linkname is longer than LENGTH_LINK
           or if the name cannot be split into a prefix and a name part.
        """
        info["magic"] = POSIX_MAGIC

        # NOTE(review): the lengths below are measured in characters, the
        # header fields are filled from the encoded strings -- confirm that
        # names with multi-byte characters are handled as intended.
        if len(info["linkname"]) > LENGTH_LINK:
            raise ValueError("linkname is too long")

        if len(info["name"]) > LENGTH_NAME:
            info["prefix"], info["name"] = self._posix_split_name(info["name"])

        return self._create_header(info, USTAR_FORMAT, encoding, errors)

    def create_gnu_header(self, info, encoding, errors):
        """Return the object as a GNU header block sequence.

           A linkname or name that exceeds its field length is stored in
           a preceding GNUTYPE_LONGLINK or GNUTYPE_LONGNAME sequence.
        """
        info["magic"] = GNU_MAGIC

        buf = b""
        if len(info["linkname"]) > LENGTH_LINK:
            buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)

        if len(info["name"]) > LENGTH_NAME:
            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)

        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)

    def create_pax_header(self, info, encoding):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        # Work on a copy, so that the member's own pax_headers stay untouched.
        pax_headers = self.pax_headers.copy()

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32), ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")

    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")

    def _posix_split_name(self, name):
        """Split a name longer than 100 chars into a prefix
           and a name part.

           The split is made at the last slash within the first
           LENGTH_PREFIX + 1 characters. Raise ValueError if there is no
           such slash or if the remaining name part is still longer
           than LENGTH_NAME.
        """
        prefix = name[:LENGTH_PREFIX + 1]
        while prefix and prefix[-1] != "/":
            prefix = prefix[:-1]

        name = name[len(prefix):]
        # Drop the separating slash from the prefix.
        prefix = prefix[:-1]

        if not prefix or len(name) > LENGTH_NAME:
            raise ValueError("name is too long")
        return prefix, name

    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            # Checksum field: an 8 byte placeholder of spaces. It has to be
            # exactly 8 bytes wide, because the checksum is patched in at
            # offset 148 below and frombuf() reads it from buf[148:156].
            b" " * 8,
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            stn(info.get("prefix", ""), 155, encoding, errors)
        ]

        # Pad the joined fields with NUL bytes up to a full block.
        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # Overwrite the first 7 bytes of the placeholder with the checksum
        # (six octal digits plus NUL), the 8th byte stays a space.
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf

    @staticmethod
    def _create_payload(payload):
        """Return the string payload filled with zero bytes
           up to the next 512 byte border.
        """
        remainder = len(payload) % BLOCKSIZE
        if remainder > 0:
            payload += (BLOCKSIZE - remainder) * NUL
        return payload

    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
           for name.
        """
        # The payload is the encoded name followed by a NUL byte.
        name = name.encode(encoding, errors) + NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
                cls._create_payload(name)

    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
           that contains a list of keyword, value pairs. The values
           must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for value in pax_headers.values():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            # The record length includes the digits of the length field
            # itself, so iterate until the value is stable.
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)

    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.

           Raise EmptyHeaderError for an empty buffer, TruncatedHeaderError
           for a buffer that is not BLOCKSIZE bytes long, EOFHeaderError
           for a block of NUL bytes and InvalidHeaderError if the checksum
           does not match.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        return obj

    @classmethod
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
           tarfile.
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
        # The header block has just been read, so it starts one block
        # before the current position.
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)

    #--------------------------------------------------------------------------
    # The following are methods that are called depending on the type of a
    # member. The entry point is _proc_member() which can be overridden in a
    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following
    # operations:
    # 1. Set self.offset_data to the position where the data blocks begin,
    #    if there is data that follows.
    # 2. Set tarfile.offset to the position where the next member's header will
    #    begin.
    # 3. Return self or another valid TarInfo object.
    def _proc_member(self, tarfile):
        """Choose the right processing method depending on
           the type and call it.
        """
        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
            return self._proc_gnulong(tarfile)
        elif self.type == GNUTYPE_SPARSE:
            return self._proc_sparse(tarfile)
        elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
            return self._proc_pax(tarfile)
        else:
            return self._proc_builtin(tarfile)

    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
           will be treated as a regular file.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self

    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
           or longlink member.

           Raise SubsequentHeaderError if the header that follows the
           long name blocks is missing or bad.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next

    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        # The next header is located using the size from the header, only
        # afterwards self.size is replaced with the original file size.
        tarfile.offset = self.offset_data + self._block(self.size)
        self.size = origsize
        return self

    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2008.

           Raise InvalidHeaderError if a record declares a length of
           zero and SubsequentHeaderError if the header that follows
           the pax header is missing or bad.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf-8"

        # Parse pax header information. A record looks like this:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            if length == 0:
                # A record of length zero would leave `pos' unchanged and
                # the same record would be matched again forever.
                raise InvalidHeaderError("invalid header")
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf-8", "utf-8",
                        tarfile.errors)

            pax_headers[keyword] = value
            pos += length

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next

    def _proc_gnusparse_00(self, next, pax_headers, buf):
        """Process a GNU tar extended sparse header, version 0.0.

           The offsets and sizes are collected from the raw header
           buffer and stored pairwise in next.sparse.
        """
        offsets = []
        for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
            offsets.append(int(match.group(1)))
        numbytes = []
        for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
            numbytes.append(int(match.group(1)))
        next.sparse = list(zip(offsets, numbytes))

    def _proc_gnusparse_01(self, next, pax_headers):
        """Process a GNU tar extended sparse header, version 0.1.

           The "GNU.sparse.map" value is a comma-separated list of
           numbers that is split into pairs of two.
        """
        sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
        """Process a GNU tar extended sparse header, version 1.0.

           The sparse map is read from the file itself: a line with the
           number of pairs, followed by one number per line.
        """
        sparse = []
        buf = tarfile.fileobj.read(BLOCKSIZE)
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            # Read another block if the buffer holds no complete line.
            if b"\n" not in buf:
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
           pax extended or global header.
        """
        for keyword, value in pax_headers.items():
            if keyword == "GNU.sparse.name":
                setattr(self, "path", value)
            elif keyword == "GNU.sparse.size":
                setattr(self, "size", int(value))
            elif keyword == "GNU.sparse.realsize":
                setattr(self, "size", int(value))
            elif keyword in PAX_FIELDS:
                if keyword in PAX_NUMBER_FIELDS:
                    # A number that cannot be converted is replaced with 0.
                    try:
                        value = PAX_NUMBER_FIELDS[keyword](value)
                    except ValueError:
                        value = 0
                if keyword == "path":
                    value = value.rstrip("/")
                setattr(self, keyword, value)

        self.pax_headers = pax_headers.copy()

    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
        """Decode a single field from a pax record.

           Try encoding with the "strict" error handler first and use
           fallback_encoding with fallback_errors if that fails.
        """
        try:
            return value.decode(encoding, "strict")
        except UnicodeDecodeError:
            return value.decode(fallback_encoding, fallback_errors)

    def _block(self, count):
        """Round up a byte count by BLOCKSIZE and return it,
           e.g. _block(834) => 1024.
        """
        blocks, remainder = divmod(count, BLOCKSIZE)
        if remainder:
            blocks += 1
        return blocks * BLOCKSIZE

    # Type predicates, each of them compares self.type with the
    # respective type constant(s).
    def isreg(self):
        return self.type in REGULAR_TYPES
    def isfile(self):
        return self.isreg()
    def isdir(self):
        return self.type == DIRTYPE
    def issym(self):
        return self.type == SYMTYPE
    def islnk(self):
        return self.type == LNKTYPE
    def ischr(self):
        return self.type == CHRTYPE
    def isblk(self):
        return self.type == BLKTYPE
    def isfifo(self):
        return self.type == FIFOTYPE
    def issparse(self):
        return self.sparse is not None
    def isdev(self):
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
# class TarInfo
1421
1422class TarFile(object):
1423 """The TarFile Class provides an interface to tar archives.
1424 """
1425
# Verbosity of debug output: may be set from 0 (no msgs) to 3 (all msgs).
debug = 0

# If true, add the content of a linked file to the tar file,
# else the link itself.
dereference = False

# If true, skip empty or invalid blocks and continue processing.
ignore_zeros = False

# If 0, fatal errors only appear in debug messages (if debug >= 0).
# If > 0, errors are passed to the caller as exceptions.
errorlevel = 1

# The format to use when creating an archive.
format = DEFAULT_FORMAT

# Encoding for 8-bit character strings.
encoding = ENCODING

# Error handler for unicode conversion.
errors = None

# The default TarInfo class to use.
tarinfo = TarInfo

# The file-object class for extractfile(); io.BufferedReader is used
# if this is None.
fileobject = None
Guido van Rossumd8faa362007-04-27 19:54:29 +00001448
def __init__(self, name=None, mode="r", fileobj=None, format=None,
        tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
        errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
    """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
       read from an existing archive, 'a' to append data to an existing
       file or 'w' to create a new file overwriting an existing one. `mode'
       defaults to 'r'.
       If `fileobj' is given, it is used for reading or writing data. If it
       can be determined, `mode' is overridden by `fileobj's mode.
       `fileobj' is not closed, when TarFile is closed.

       `format', `tarinfo', `dereference', `ignore_zeros', `encoding',
       `debug' and `errorlevel' replace the class attribute of the same
       name for this instance if they are not None. `errors' is always
       stored. `pax_headers' is only used if the format is PAX_FORMAT.

       A ValueError is raised for an invalid `mode'. In append mode, a
       ReadError is raised if the existing archive has an invalid header.
    """
    # Validate by looking the mode up in the mapping. A substring test
    # against "raw" would let the empty string through, which then
    # fails with a KeyError instead of the documented ValueError.
    modes = {"r": "rb", "a": "r+b", "w": "wb"}
    if mode not in modes:
        raise ValueError("mode must be 'r', 'a' or 'w'")
    self.mode = mode
    self._mode = modes[mode]

    if not fileobj:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        self._extfileobj = False
    else:
        if name is None and hasattr(fileobj, "name"):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Init attributes.
    if format is not None:
        self.format = format
    if tarinfo is not None:
        self.tarinfo = tarinfo
    if dereference is not None:
        self.dereference = dereference
    if ignore_zeros is not None:
        self.ignore_zeros = ignore_zeros
    if encoding is not None:
        self.encoding = encoding
    self.errors = errors

    if pax_headers is not None and self.format == PAX_FORMAT:
        self.pax_headers = pax_headers
    else:
        self.pax_headers = {}

    if debug is not None:
        self.debug = debug
    if errorlevel is not None:
        self.errorlevel = errorlevel

    # Init datastructures.
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    tarinfo = self.tarinfo.fromtarfile(self)
                    self.members.append(tarinfo)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e))

        if self.mode in "aw":
            self._loaded = True

            if self.pax_headers:
                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    except:
        # Deliberately catches everything: a file that was opened here
        # must not stay open if construction fails. The exception is
        # always re-raised.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001544
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001545 #--------------------------------------------------------------------------
1546 # Below are the classmethods which act as alternate constructors to the
1547 # TarFile class. The open() method is the only one that is needed for
1548 # public use; it is the "super"-constructor and is able to select an
1549 # adequate "sub"-constructor for a particular compression using the mapping
1550 # from OPEN_METH.
1551 #
1552 # This concept allows one to subclass TarFile without losing the comfort of
1553 # the super-constructor. A sub-constructor is registered and made available
1554 # by adding it to the mapping in OPEN_METH.
1555
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
       an appropriate TarFile class.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'r:xz'       open for reading with lzma compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression
       'w:xz'       open for writing with lzma compression

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'r|xz'       open an lzma compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing
       'w|xz'       open an lzma compressed stream for writing

       `bufsize' is passed on to the stream object in the '|' modes.
       All other keyword arguments are passed on to the selected
       constructor.

       A ValueError is raised if neither `name' nor `fileobj' is given
       or if `mode' cannot be interpreted, a CompressionError for an
       unknown compression type and a ReadError if no opener succeeds
       in transparent mode.
    """

    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Find out which *open() is appropriate for opening the file.
        for comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
            if fileobj is not None:
                saved_pos = fileobj.tell()
            try:
                return func(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # This opener does not fit; rewind for the next one.
                if fileobj is not None:
                    fileobj.seek(saved_pos)
                continue
        raise ReadError("file could not be opened successfully")

    elif ":" in mode:
        filemode, comptype = mode.split(":", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
        else:
            raise CompressionError("unknown compression type %r" % comptype)
        return func(name, filemode, fileobj, **kwargs)

    elif "|" in mode:
        filemode, comptype = mode.split("|", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Compare against the exact modes; a substring test against
        # "rw" would also accept "rw" itself.
        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            t = cls(name, filemode, stream, **kwargs)
        except:
            # Do not leave the stream open if construction fails.
            stream.close()
            raise
        t._extfileobj = False
        return t

    # Exact match only; a substring test against "aw" would also let
    # "" and "aw" through.
    elif mode in ("a", "w"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001635
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.

       `mode' must be 'r', 'a' or 'w', otherwise a ValueError is raised.
       All arguments are passed on to the class constructor.
    """
    # Exact match only; a substring test against "raw" would accept "".
    if mode not in ("r", "a", "w"):
        raise ValueError("mode must be 'r', 'a' or 'w'")
    return cls(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001643
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
       Appending is not allowed.

       `mode' must be 'r' or 'w', otherwise a ValueError is raised.
       A CompressionError is raised if the gzip module is not usable
       and a ReadError if an IOError occurs while opening the archive.
       A `fileobj' passed in by the caller is not closed by the
       returned TarFile.
    """
    # Exact match only; a substring test against "rw" would accept "".
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    extfileobj = fileobj is not None
    try:
        fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except IOError:
        if not extfileobj and fileobj is not None:
            fileobj.close()
        # fileobj is still None if GzipFile() itself failed on a named
        # file; pass that error on unchanged.
        if fileobj is None:
            raise
        raise ReadError("not a gzip file")
    except:
        if not extfileobj and fileobj is not None:
            fileobj.close()
        raise
    t._extfileobj = extfileobj
    return t
1674
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
       Appending is not allowed.

       `mode' must be 'r' or 'w', otherwise a ValueError is raised.
       A CompressionError is raised if the bz2 module is not available
       and a ReadError if an IOError or EOFError occurs while opening
       the archive.
    """
    # Exact match only; a substring test against "rw" would accept "".
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    fileobj = bz2.BZ2File(filename=name if fileobj is None else None,
            mode=mode, fileobj=fileobj, compresslevel=compresslevel)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (IOError, EOFError):
        fileobj.close()
        raise ReadError("not a bzip2 file")
    except:
        # Do not leak the BZ2File wrapper on any other failure either.
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1698
@classmethod
def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
    """Open lzma compressed tar archive name for reading or writing.
       Appending is not allowed.

       `mode' must be 'r' or 'w', otherwise a ValueError is raised.
       A CompressionError is raised if the lzma module is not available
       and a ReadError if an LZMAError or EOFError occurs while opening
       the archive.
    """
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import lzma
    except ImportError:
        raise CompressionError("lzma module is not available")

    fileobj = lzma.LZMAFile(filename=name if fileobj is None else None,
            mode=mode, fileobj=fileobj, preset=preset)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (lzma.LZMAError, EOFError):
        fileobj.close()
        raise ReadError("not an lzma file")
    except:
        # Do not leak the LZMAFile wrapper on any other failure either.
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1722
# Registry of all *open() methods: maps a compression type to the name
# of the classmethod that opens archives of that type.
OPEN_METH = dict(
    tar="taropen",      # uncompressed tar
    gz="gzopen",        # gzip compressed tar
    bz2="bz2open",      # bzip2 compressed tar
    xz="xzopen",        # lzma compressed tar
)
1730
1731 #--------------------------------------------------------------------------
1732 # The public methods which TarFile provides:
1733
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive.

       The underlying file object is closed unless it was passed in by
       the caller. This also happens if writing the end of the archive
       fails; the exception is passed on in that case.
    """
    if self.closed:
        return

    # Mark the archive as closed first, so that a failure below does
    # not leave it half-closed.
    self.closed = True
    try:
        if self.mode in "aw":
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
    finally:
        # Always release a file that was opened by this object, even
        # if writing the trailer raised.
        if not self._extfileobj:
            self.fileobj.close()
1753
def getmember(self, name):
    """Look up the member `name' and return its TarInfo object. A
       KeyError is raised if `name' is not found in the archive. If a
       member occurs more than once in the archive, its last occurrence
       is assumed to be the most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001764
def getmembers(self):
    """Return the list of TarInfo objects of all members of the
       archive, in the same order as the members in the archive.
    """
    self._check()
    if self._loaded:
        return self.members
    # Not all members are known yet, so the whole archive has to be
    # scanned first.
    self._load()
    return self.members
1774
def getnames(self):
    """Return the names of all members of the archive as a list, in
       the same order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001780
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Build a TarInfo object describing an existing file.

       The file is given either by its path `name' or by the file object
       `fileobj' (os.fstat() is used on its file descriptor and its
       `name' attribute replaces `name'). If given, `arcname' specifies
       an alternative name for the file in the archive. Attributes of
       the returned TarInfo may be modified before it is added using
       addfile(). None is returned if the file type is not one of those
       handled here.
    """
    self._check("aw")

    # A file object dictates the name that is used.
    if fileobj is not None:
        name = fileobj.name

    # Build the member name: strip the drive, convert backward slashes
    # to forward slashes and turn absolute paths into relative ones.
    if arcname is None:
        arcname = name
    arcname = os.path.splitdrive(arcname)[1]
    arcname = arcname.replace(os.sep, "/").lstrip("/")

    info = self.tarinfo()
    info.tarfile = self

    # Stat the file. Symlinks are followed if dereference is set or if
    # the platform offers no lstat().
    if fileobj is not None:
        st = os.fstat(fileobj.fileno())
    elif self.dereference or not hasattr(os, "lstat"):
        st = os.stat(name)
    else:
        st = os.lstat(name)

    file_mode = st.st_mode
    target = ""
    if stat.S_ISREG(file_mode):
        inode = (st.st_ino, st.st_dev)
        already_archived = (not self.dereference and st.st_nlink > 1 and
                            inode in self.inodes and
                            arcname != self.inodes[inode])
        if already_archived:
            # A hardlink to a file that is in the archive already.
            kind = LNKTYPE
            target = self.inodes[inode]
        else:
            kind = REGTYPE
            # Remember the inode only if it is valid; for win32 it is
            # always 0.
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(file_mode):
        kind = DIRTYPE
    elif stat.S_ISFIFO(file_mode):
        kind = FIFOTYPE
    elif stat.S_ISLNK(file_mode):
        kind = SYMTYPE
        target = os.readlink(name)
    elif stat.S_ISCHR(file_mode):
        kind = CHRTYPE
    elif stat.S_ISBLK(file_mode):
        kind = BLKTYPE
    else:
        return None

    # Copy everything that is known about the file to the TarInfo.
    info.name = arcname
    info.mode = file_mode
    info.uid = st.st_uid
    info.gid = st.st_gid
    # Only regular files carry data.
    info.size = st.st_size if kind == REGTYPE else 0
    info.mtime = st.st_mtime
    info.type = kind
    info.linkname = target

    # Resolve user and group names where possible; ids without an
    # entry keep the default name.
    if pwd:
        try:
            info.uname = pwd.getpwuid(info.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            info.gname = grp.getgrgid(info.gid)[0]
        except KeyError:
            pass

    if kind in (CHRTYPE, BLKTYPE):
        if hasattr(os, "major") and hasattr(os, "minor"):
            info.devmajor = os.major(st.st_rdev)
            info.devminor = os.minor(st.st_rdev)
    return info
1878
def list(self, verbose=True):
    """Print a table of contents to sys.stdout, one line per member.
       If `verbose' is False, only the names of the members are
       printed. If it is True, an `ls -l'-like output is produced.
    """
    self._check()

    def emit(*fields):
        # Print the fields followed by a blank, staying on the line.
        print(*fields, end=' ')

    for member in self:
        if verbose:
            emit(filemode(member.mode))
            emit("%s/%s" % (member.uname or member.uid,
                            member.gname or member.gid))
            if member.ischr() or member.isblk():
                # Devices show their major,minor numbers, not a size.
                device = "%d,%d" % (member.devmajor, member.devminor)
                emit("%10s" % device)
            else:
                emit("%10d" % member.size)
            emit("%d-%02d-%02d %02d:%02d:%02d"
                 % time.localtime(member.mtime)[:6])

        suffix = "/" if member.isdir() else ""
        emit(member.name + suffix)

        if verbose:
            if member.issym():
                emit("->", member.linkname)
            if member.islnk():
                emit("link to", member.linkname)
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001907
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. `exclude' is a function that should
       return True for each filename to be excluded; it is deprecated in
       favour of `filter' and triggers a DeprecationWarning. `filter' is a
       function that expects a TarInfo object argument and returns the
       changed TarInfo object, if it returns None the TarInfo object will
       be excluded from the archive.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)

    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        # The with-statement closes the file even if addfile() fails.
        with bltn_open(name, "rb") as f:
            self.addfile(tarinfo, f)

    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            for f in os.listdir(name):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                        recursive, exclude, filter=filter)

    else:
        self.addfile(tarinfo)
1969
def addfile(self, tarinfo, fileobj=None):
    """Append the header built from `tarinfo' to the archive. If
       `fileobj' is given, tarinfo.size bytes are read from it and
       added to the archive, padded with NULs to a full block.
       You can create TarInfo objects using gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.
    """
    self._check("aw")

    # Work on a copy; this copy is what ends up in the members list.
    tarinfo = copy.copy(tarinfo)

    header = tarinfo.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset += len(header)

    # If there's data to follow, append it.
    if fileobj is not None:
        copyfileobj(fileobj, self.fileobj, tarinfo.size)
        remainder = tarinfo.size % BLOCKSIZE
        if remainder > 0:
            padding = BLOCKSIZE - remainder
            self.fileobj.write(NUL * padding)
        else:
            padding = 0
        self.offset += tarinfo.size + padding

    self.members.append(tarinfo)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001995
def extractall(self, path=".", members=None):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers().
    """
    if members is None:
        members = self

    deferred = []
    for member in members:
        if member.isdir():
            # Extract directories with a safe mode and without
            # attributes; the real ones are applied further down.
            deferred.append(member)
            member = copy.copy(member)
            member.mode = 0o700
            self.extract(member, path, set_attrs=False)
        else:
            self.extract(member, path, set_attrs=True)

    # Walk the directories in reverse sorted order.
    deferred.sort(key=lambda entry: entry.name)

    # Set correct owner, mtime and filemode on directories.
    for member in reversed(deferred):
        dirpath = os.path.join(path, member.name)
        try:
            self.chown(member, dirpath)
            self.utime(member, dirpath)
            self.chmod(member, dirpath)
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % e)
2033
def extract(self, member, path="", set_attrs=True):
    """Extract a member from the archive to the current working directory,
       using its full name. Its file information is extracted as accurately
       as possible. `member' may be a filename or a TarInfo object. You can
       specify a different directory using `path'. File attributes (owner,
       mtime, mode) are set unless `set_attrs' is False.

       Depending on `errorlevel', errors are either raised or only
       reported as debug messages.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Prepare the link target for makelink().
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    try:
        self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                             set_attrs=set_attrs)
    except EnvironmentError as e:
        if self.errorlevel > 0:
            raise
        if e.filename is None:
            self._dbg(1, "tarfile: %s" % e.strerror)
        else:
            self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
    except ExtractError as e:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % e)
2068
def extractfile(self, member):
    """Return a file object for reading the data of a member. `member'
       may be a filename or a TarInfo object. If `member' is a regular
       file or a link, an io.BufferedReader object is returned (or an
       instance of the `fileobject' class if that is set). Otherwise,
       None is returned.
    """
    self._check("r")

    if isinstance(member, str):
        tarinfo = self.getmember(member)
    else:
        tarinfo = member

    # Members with unknown types are treated as regular files.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        if self.fileobject is not None:
            # Keep the traditional pre-3.3 API intact.
            return self.fileobject(self, tarinfo)
        raw = _FileInFile(self.fileobj, tarinfo.offset_data, tarinfo.size, tarinfo.sparse)
        return io.BufferedReader(raw)

    if tarinfo.islnk() or tarinfo.issym():
        if isinstance(self.fileobj, _Stream):
            # A small but ugly workaround for the case that someone tries
            # to extract a (sym)link as a file-object from a non-seekable
            # stream of tar blocks.
            raise StreamError("cannot extract (sym)link as file object")
        # A (sym)link's file object is its target's file object.
        return self.extractfile(self._find_link_target(tarinfo))

    # If there's no data associated with the member (directory, chrdev,
    # blkdev, etc.), return None instead of a file object.
    return None
2104
def _extract_member(self, tarinfo, targetpath, set_attrs=True):
    """Write the member described by `tarinfo' to the filesystem
       location `targetpath'.

       Missing parent directories are created first, then the matching
       make*() method builds the object, and finally owner, mode and
       mtime are applied unless `set_attrs' is False.
    """
    # Drop trailing slashes and switch from forward slashes to the
    # platform specific separator.
    targetpath = targetpath.rstrip("/").replace("/", os.sep)

    # Directories that are not part of the archive are created with
    # default permissions.
    parent = os.path.dirname(targetpath)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    if tarinfo.islnk() or tarinfo.issym():
        label = "%s -> %s" % (tarinfo.name, tarinfo.linkname)
    else:
        label = tarinfo.name
    self._dbg(1, label)

    # Pick the creation method for the member's type, then run it.
    if tarinfo.isreg():
        make = self.makefile
    elif tarinfo.isdir():
        make = self.makedir
    elif tarinfo.isfifo():
        make = self.makefifo
    elif tarinfo.ischr() or tarinfo.isblk():
        make = self.makedev
    elif tarinfo.islnk() or tarinfo.issym():
        make = self.makelink
    elif tarinfo.type not in SUPPORTED_TYPES:
        make = self.makeunknown
    else:
        make = self.makefile
    make(tarinfo, targetpath)

    if not set_attrs:
        return

    self.chown(tarinfo, targetpath)
    # Mode and mtime are not applied to symbolic links.
    if not tarinfo.issym():
        self.chmod(tarinfo, targetpath)
        self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002147
2148 #--------------------------------------------------------------------------
2149 # Below are the different file methods. They are called via
2150 # _extract_member() when extract() is called. They can be replaced in a
2151 # subclass to implement other functionality.
2152
def makedir(self, tarinfo, targetpath):
    """Create the directory `targetpath'; an already existing path is
       silently accepted.
    """
    # Start out with a safe, owner-only mode. The real mode is set
    # later in _extract_member().
    safe_mode = 0o700
    try:
        os.mkdir(targetpath, safe_mode)
    except FileExistsError:
        return
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002162
def makefile(self, tarinfo, targetpath):
    """Make a file called targetpath.

       The member's data is copied from self.fileobj, starting at
       tarinfo.offset_data. For a sparse member (tarinfo.sparse is not
       None) each (offset, size) segment is written at its offset;
       otherwise tarinfo.size bytes are copied in one go. The file is
       finally cut or extended to exactly tarinfo.size bytes.
    """
    source = self.fileobj
    source.seek(tarinfo.offset_data)
    # The with-block guarantees the target is closed even when copying
    # fails part-way (e.g. a truncated archive or a full disk), instead
    # of leaking the open file handle.
    with bltn_open(targetpath, "wb") as target:
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
        else:
            copyfileobj(source, target, tarinfo.size)
        # Fix the final length; this matters for sparse files that end
        # in a hole.
        target.seek(tarinfo.size)
        target.truncate()
2178
def makeunknown(self, tarinfo, targetpath):
    """Extract a member of unknown type to `targetpath' as if it were
       a regular file, and report the fallback via self._dbg().
    """
    self.makefile(tarinfo, targetpath)
    notice = ("tarfile: Unknown file type %r, "
              "extracted as regular file." % tarinfo.type)
    self._dbg(1, notice)
2186
def makefifo(self, tarinfo, targetpath):
    """Create a fifo at `targetpath'.

       Raises ExtractError when the os module offers no mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002194
def makedev(self, tarinfo, targetpath):
    """Create a character or block device node at `targetpath'.

       Raises ExtractError when the os module lacks mknod() or
       makedev().
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the permission bits with the matching file type flag.
    type_flag = stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR
    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, tarinfo.mode | type_flag, device)
2209
def makelink(self, tarinfo, targetpath):
    """Create a symbolic or hard link at `targetpath'.

       When the link cannot be created (platform limitation), the
       referenced member is extracted to `targetpath' instead. Raises
       ExtractError if that member cannot be found in the archive.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        elif os.path.exists(tarinfo._link_target):
            # _link_target is prepared by extract().
            os.link(tarinfo._link_target, targetpath)
        else:
            # The hard link's target is not on disk: extract a copy of
            # the archived target in its place.
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
    except symlink_exception:
        try:
            substitute = self._find_link_target(tarinfo)
            self._extract_member(substitute, targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002232
def chown(self, tarinfo, targetpath):
    """Set the owner of `targetpath' according to tarinfo.

       Nothing happens unless the pwd module is available and the
       effective user id is 0. Raises ExtractError if the ownership
       change fails.
    """
    # We have to be root to do so.
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        return

    # Prefer the ids that belong to the stored names on this system and
    # fall back to the numeric ids kept in the archive.
    try:
        group_id = grp.getgrnam(tarinfo.gname)[2]
    except KeyError:
        group_id = tarinfo.gid
    try:
        user_id = pwd.getpwnam(tarinfo.uname)[2]
    except KeyError:
        user_id = tarinfo.uid

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, user_id, group_id)
        elif sys.platform != "os2emx":
            os.chown(targetpath, user_id, group_id)
    except EnvironmentError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002254
def chmod(self, tarinfo, targetpath):
    """Apply the permission bits in tarinfo.mode to `targetpath'.

       Does nothing when the os module has no chmod(). Raises
       ExtractError if the mode cannot be changed.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except EnvironmentError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002263
def utime(self, tarinfo, targetpath):
    """Set access and modification time of `targetpath' to
       tarinfo.mtime.

       Does nothing when the os module has no utime(). Raises
       ExtractError if the times cannot be changed.
    """
    if hasattr(os, 'utime'):
        try:
            stamps = (tarinfo.mtime, tarinfo.mtime)
            os.utime(targetpath, stamps)
        except EnvironmentError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002273
2274 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, or
       None when no more members are available. Needs a TarFile opened
       in mode "r" or "a".

       Every header read here is appended to self.members; once the end
       is reached self._loaded is set. A pending self.firstmember is
       handed out first, without reading from the file. Raises
       ReadError for the header errors handled below.
    """
    self._check("ra")

    pending = self.firstmember
    if pending is not None:
        self.firstmember = None
        return pending

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as exc:
            if self.ignore_zeros:
                # Skip this block and try the following one.
                self._dbg(2, "0x%X: %s" % (self.offset, exc))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as exc:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, exc))
                self.offset += BLOCKSIZE
                continue
            if self.offset == 0:
                # A bad header at the very start is reported as an
                # error; further in it simply ends the reading.
                raise ReadError(str(exc))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as exc:
            if self.offset == 0:
                raise ReadError(str(exc))
        except SubsequentHeaderError as exc:
            raise ReadError(str(exc))
        break

    if tarinfo is None:
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002320
2321 #--------------------------------------------------------------------------
2322 # Little helper methods:
2323
def _getmember(self, name, tarinfo=None, normalize=False):
    """Search the archive members from bottom to top for one called
       `name' and return it, or None if there is no match.

       If `tarinfo' is given, only members stored before it are
       considered. With `normalize' set, both `name' and the member
       names are passed through os.path.normpath() before comparing.
    """
    # getmembers() ensures that all members have been loaded.
    candidates = self.getmembers()

    # Limit the member search list up to tarinfo.
    if tarinfo is not None:
        candidates = candidates[:candidates.index(tarinfo)]

    if normalize:
        name = os.path.normpath(name)

    for candidate in reversed(candidates):
        candidate_name = candidate.name
        if normalize:
            candidate_name = os.path.normpath(candidate_name)
        if name == candidate_name:
            return candidate

    return None
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002346
def _load(self):
    """Walk through the whole archive by calling next() until it
       returns None, then flag the TarFile as fully loaded.
    """
    while self.next() is not None:
        pass
    self._loaded = True
2356
def _check(self, mode=None):
    """Raise IOError if the TarFile is closed, or if `mode' is given
       and the TarFile's own mode is not contained in it.
    """
    if self.closed:
        raise IOError("%s is closed" % self.__class__.__name__)
    if mode is None or self.mode in mode:
        return
    raise IOError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002365
def _find_link_target(self, tarinfo):
    """Return the archive member that the symlink or hardlink member
       `tarinfo' points to.

       Raises KeyError if no such member is found.
    """
    if tarinfo.issym():
        # A symlink is resolved relative to the directory of the link
        # itself, and the entire archive is searched.
        pieces = (os.path.dirname(tarinfo.name), tarinfo.linkname)
        linkname = "/".join(piece for piece in pieces if piece)
        limit = None
    else:
        # Search the archive before the link, because a hard link is
        # just a reference to an already archived file.
        linkname = tarinfo.linkname
        limit = tarinfo

    found = self._getmember(linkname, tarinfo=limit, normalize=True)
    if found is None:
        raise KeyError("linkname %r not found" % linkname)
    return found
2384
def __iter__(self):
    """Return an iterator over the archive's members: a TarIter while
       the archive is still being read, or a plain iterator over
       self.members once everything is loaded.
    """
    if not self._loaded:
        return TarIter(self)
    return iter(self.members)
2392
def _dbg(self, level, msg):
    """Print `msg' to sys.stderr if `level' does not exceed
       self.debug.
    """
    if level > self.debug:
        return
    print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002398
def __enter__(self):
    """Enter the runtime context: verify that the TarFile is still
       open (see _check()) and return it.
    """
    self._check()
    return self
2402
def __exit__(self, type, value, traceback):
    """Leave the runtime context.

       Without an exception the archive is closed normally. After an
       exception only the underlying file object is closed (and only if
       it is not an external one), and the TarFile is marked as closed.
    """
    if type is None:
        self.close()
        return

    # An exception occurred. We must not call close() because
    # it would try to write end-of-archive blocks and padding.
    if not self._extfileobj:
        self.fileobj.close()
    self.closed = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002412# class TarFile
2413
class TarIter:
    """Iterator Class.

       for tarinfo in TarFile(...):
           suite...

       Each TarIter keeps its own position (self.index), so several
       iterators over the same TarFile each start at the first member,
       even when the archive has only been read partially so far.
    """

    def __init__(self, tarfile):
        """Construct a TarIter object.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return iterator object.
        """
        return self

    def __next__(self):
        """Return the next member, or raise StopIteration at the end.

           Members that the TarFile has already read are served from
           its `members' list; only when this iterator has caught up
           with that list is TarFile.next() used to read more. When all
           members have been read, the TarFile is set as _loaded.
        """
        tarfile = self.tarfile
        if self.index == 0 and tarfile.firstmember is not None:
            # TarFile.next() hands out a pending firstmember without
            # appending it to `members', so it must be consumed through
            # next() or it would be returned a second time later on.
            # NOTE(review): assumes a pending firstmember is already
            # the first entry of `members' -- confirm against the code
            # that sets TarFile.firstmember.
            tarinfo = tarfile.next()
        elif self.index < len(tarfile.members):
            # Already read, either by another iterator or by
            # getmembers() being called during iteration (SF #1100429).
            tarinfo = tarfile.members[self.index]
        elif not tarfile._loaded:
            tarinfo = tarfile.next()
            if not tarinfo:
                tarfile._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return tarinfo
2449
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002450#--------------------
2451# exported functions
2452#--------------------
def is_tarfile(name):
    """Return True if `name' points to a tar archive that we are able
       to handle, else return False.

       The check consists of opening the archive with this module's
       open() and closing it again; a TarError means "not a tar file".
    """
    try:
        open(name).close()
    except TarError:
        return False
    return True
2463
# Keep the builtin open() reachable as bltn_open (TarFile.makefile() uses it
# to create the extracted file) before the module-level name is rebound.
bltn_open = open
# From here on, this module's open() is TarFile.open; is_tarfile() above
# relies on that when it calls open(name).
open = TarFile.open