blob: 5c9d3a8f3558501d9cb39d63ca88e3365180f218 [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
# Module metadata: release number, author/credits, and the revision
# keywords of the last check-in.
version = "0.9.0"
__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000037
38#---------
39# Imports
40#---------
41import sys
42import os
Eli Bendersky74c503b2012-01-03 06:26:13 +020043import io
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000044import shutil
45import stat
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000046import time
47import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000048import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000049import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000050
51try:
52 import grp, pwd
53except ImportError:
54 grp = pwd = None
55
# Exceptions that signal "symbolic links cannot be created here".
# os.symlink on Windows prior to 6.0 raises NotImplementedError.  Where the
# WindowsError builtin exists it is included too: WindowsError (1314) is
# raised if the caller does not hold the SeCreateSymbolicLinkPrivilege
# privilege.
try:
    WindowsError
except NameError:
    symlink_exception = (AttributeError, NotImplementedError)
else:
    symlink_exception = (AttributeError, NotImplementedError, WindowsError)
64
# Names exported by "from tarfile import *".
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
67
Georg Brandl1a3284e2007-12-02 09:40:06 +000068from builtins import open as _open # Since 'open' is TarFile.open
Guido van Rossum8f78fe92006-08-24 04:03:53 +000069
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records

# Header magic values.  Both are 8 bytes long: they cover the magic field
# together with the version field that follows it in the header.
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string: "ustar", 2 spaces, NUL
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string: "ustar", NUL, "00"

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

# Values of the one-byte type flag of a header.
REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

# Archive formats that can be produced.
USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000105
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# Member types that tarfile is able to handle.
SUPPORTED_TYPES = (
    REGTYPE, AREGTYPE, LNKTYPE,
    SYMTYPE, DIRTYPE, FIFOTYPE,
    CONTTYPE, CHRTYPE, BLKTYPE,
    GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
    GNUTYPE_SPARSE,
)

# Member types that are handled like a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE, CONTTYPE, GNUTYPE_SPARSE)

# Member types that belong to the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, GNUTYPE_SPARSE)

# Pax header keywords whose value overrides a TarInfo attribute.
PAX_FIELDS = (
    "path", "linkpath", "size", "mtime",
    "uid", "gid", "uname", "gname",
)

# Pax header keywords that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Pax header keywords that hold numbers, mapped to the type used to
# convert them; every other keyword is treated as a string.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int,
}
141
#---------------------------------------------------------
# Bits used in the mode field, values in octal.
#---------------------------------------------------------
# File type bits.
S_IFLNK = 0o120000              # symbolic link
S_IFREG = 0o100000              # regular file
S_IFBLK = 0o060000              # block device
S_IFDIR = 0o040000              # directory
S_IFCHR = 0o020000              # character device
S_IFIFO = 0o010000              # fifo

# Special bits: set UID on execution, set GID on execution, reserved.
TSUID, TSGID, TSVTX = 0o4000, 0o2000, 0o1000

# Permission bits: read, write, execute/search ...
TUREAD, TUWRITE, TUEXEC = 0o400, 0o200, 0o100   # ... by owner
TGREAD, TGWRITE, TGEXEC = 0o040, 0o020, 0o010   # ... by group
TOREAD, TOWRITE, TOEXEC = 0o004, 0o002, 0o001   # ... by other
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000165
#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# Module-wide default encoding: always UTF-8 when os.name is "nt" or "ce",
# otherwise whatever sys.getfilesystemencoding() reports.
ENCODING = "utf-8" if os.name in ("nt", "ce") else sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000173
174#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000175# Some useful functions
176#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000177
def stn(s, length, encoding, errors):
    """Encode the string s into a bytes field of exactly length bytes.

    s is encoded with the given encoding and error handler.  An encoded
    value longer than length is cut off; a shorter one is padded at the
    end with NUL bytes.
    """
    encoded = s.encode(encoding, errors)
    # A negative repeat count yields b"", so over-long values get no padding.
    padding = NUL * (length - len(encoded))
    return encoded[:length] + padding
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
def nts(s, encoding, errors):
    """Decode a null-terminated bytes object into a string.

    Everything from the first NUL byte on is discarded; without a NUL
    byte the whole of s is decoded.
    """
    text, _, _ = s.partition(b"\0")
    return text.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000191
def nti(s):
    """Convert a number field to a python number.

    Two encodings are understood (see itn()): a field whose first byte is
    0o200 or 0o377 holds a big-endian base-256 number in the remaining
    bytes (0o377 marks a negative value); any other field is parsed as a
    null-terminated string of octal digits, an empty string counting as 0.
    Raises InvalidHeaderError if the octal text cannot be parsed.
    """
    marker = s[0]
    if marker not in (0o200, 0o377):
        try:
            digits = nts(s, "ascii", "strict")
            return int(digits or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")

    value = int.from_bytes(s[1:], "big")
    if marker == 0o377:
        # Negative numbers are stored as an offset from 256 ** (len(s) - 1).
        value -= 256 ** (len(s) - 1)
    return value
210
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.

    n is the number to store (it is converted with int() first), digits
    the width of the field in bytes and format the archive format.
    Returns a bytes object for the octal encoding and a bytearray for the
    base-256 encoding.  Raises ValueError if n does not fit the field in
    the requested format.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.

    # Both the "%o" conversion and the bit operations below require an
    # integer, so non-integral input such as a float is truncated here
    # instead of failing with a TypeError further down.
    n = int(n)
    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            n = 256 ** digits + n

        # Emit the low digits-1 bytes, least significant first; inserting
        # at index 1 each time leaves them in big-endian order after the
        # marker byte.
        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s
238
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a member's header.

    All bytes of the first 512 bytes of buf are summed up except for the
    8-byte chksum field at offset 148, which is treated as if it was
    filled with spaces.  According to the GNU tar sources, some tars (Sun
    and NeXT) calculate chksum with signed char, which gives a different
    result if bytes with the high bit set are present, so both variants
    are calculated.
    """
    # 256 == 8 * ord(" "): the contribution of the blanked-out chksum
    # field, which the "8x" pad bytes skip over.
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000251
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.

    If length is None, the entire content is copied with
    shutil.copyfileobj().  Otherwise the data is moved in chunks of at
    most 16 KiB and IOError is raised as soon as src delivers fewer
    bytes than requested.
    """
    if length is None:
        shutil.copyfileobj(src, dst)
        return
    if length == 0:
        return

    def transfer(count):
        # Move exactly count bytes or fail.
        data = src.read(count)
        if len(data) < count:
            raise IOError("end of file reached")
        dst.write(data)

    chunk_size = 16 * 1024
    full_chunks, tail = divmod(length, chunk_size)
    for _ in range(full_chunks):
        transfer(chunk_size)
    if tail != 0:
        transfer(tail)
    return
276
# Lookup table used by filemode(): one inner tuple per character of the
# resulting string.  Each inner tuple lists (bits, character) candidates
# in the order they are tried.
filemode_table = (
    # file type
    ((S_IFLNK, "l"), (S_IFREG, "-"), (S_IFBLK, "b"),
     (S_IFDIR, "d"), (S_IFCHR, "c"), (S_IFIFO, "p")),

    # owner
    ((TUREAD, "r"),),
    ((TUWRITE, "w"),),
    ((TUEXEC | TSUID, "s"), (TSUID, "S"), (TUEXEC, "x")),

    # group
    ((TGREAD, "r"),),
    ((TGWRITE, "w"),),
    ((TGEXEC | TSGID, "s"), (TSGID, "S"), (TGEXEC, "x")),

    # other
    ((TOREAD, "r"),),
    ((TOWRITE, "w"),),
    ((TOEXEC | TSVTX, "t"), (TSVTX, "T"), (TOEXEC, "x")),
)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000303
def filemode(mode):
    """Convert a file's mode to a string of the form
       -rwxrwxrwx.
       Used by TarFile.list()
    """
    chars = []
    for candidates in filemode_table:
        # The first candidate whose bits are all set in mode wins; "-" is
        # used when none of them matches.
        match = next((char for bits, char in candidates
                      if mode & bits == bits), "-")
        chars.append(match)
    return "".join(chars)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000318
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000352
353#---------------------------
354# internal stream interface
355#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.

       The file is accessed through an os-level file descriptor that is
       kept in the fd attribute.
    """

    def __init__(self, name, mode):
        """Open the file name; mode must be "r" (read only) or "w"
           (write only, the file is created or truncated).  Any other
           mode raises KeyError.
        """
        mode = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        # O_BINARY only exists on some platforms; add it where available.
        if hasattr(os, "O_BINARY"):
            mode |= os.O_BINARY
        self.fd = os.open(name, mode, 0o666)

    def close(self):
        """Close the underlying file descriptor."""
        os.close(self.fd)

    def read(self, size):
        """Return up to size bytes read from the file descriptor."""
        return os.read(self.fd, size)

    def write(self, s):
        """Write all of the bytes-like object s to the file descriptor."""
        # os.write() returns the number of bytes actually written, which
        # may be less than requested; keep writing the remainder so that
        # no data is dropped silently.
        remaining = memoryview(s)
        while remaining:
            written = os.write(self.fd, remaining)
            remaining = remaining[written:]
379
class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip, bzip2 or xz compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.

           name is the file name (only opened if fileobj is None), mode
           is "r" or "w", comptype one of "tar", "gz", "bz2", "xz" or
           "*" (detect the compression from the data), and bufsize the
           block size used to access fileobj.  Raises CompressionError
           if the compression type is unknown or its module is missing.
        """
        # _extfileobj records whether the file object belongs to the
        # caller; only a file opened here is closed by this object.
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""          # raw bytes buffered from/for fileobj
        self.pos = 0            # position in the uncompressed stream
        self.closed = False

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                if mode == "r":
                    self.exception = zlib.error
                    self._init_read_gz()
                else:
                    self._init_write_gz()

            elif comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = IOError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == "xz":
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype != "tar":
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # Cleanup only: release a file opened above, mark the stream
            # as closed and let the original exception propagate.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # "closed" is missing if __init__ failed before setting it.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)
        timestamp = struct.pack("<L", int(time.time()))
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write the bytes s to the stream, compressing them first
           unless the compression type is "tar".
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Buffer the bytes s and pass the buffer on to the file object
           in chunks of bufsize bytes.  Data that does not make the
           buffer exceed bufsize stays buffered until close().
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.

           In write mode the compressor is flushed and the buffered data
           is written out, followed by the gzip trailer for "gz" streams.
        """
        if self.closed:
            return

        self.closed = True
        try:
            if self.mode == "w" and self.comptype != "tar":
                self.buf += self.cmp.flush()

            if self.mode == "w" and self.buf:
                self.fileobj.write(self.buf)
                self.buf = b""
                if self.comptype == "gz":
                    # The gzip trailer stores the crc and the uncompressed
                    # size as unsigned 32-bit little-endian integers;
                    # masking keeps both values inside that range.
                    self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
        finally:
            # Release a file opened by __init__ even if flushing failed.
            if not self._extfileobj:
                self.fileobj.close()

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

           The gzip member header is consumed from the raw stream.
           Raises ReadError if the magic number is wrong and
           CompressionError if the compression method is not deflate.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)

        if flag & 4:
            # FEXTRA: a 2-byte little-endian length followed by that many
            # bytes.  They are part of the header, so they are skipped in
            # the raw stream and must not go through the decompressor.
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.__read(xlen)
        if flag & 8:
            # FNAME: null-terminated original file name.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            # FCOMMENT: null-terminated comment.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            # FHCRC: 2-byte header checksum.
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.

           Seeking forward is done by reading and discarding data.
           Raises StreamError if pos lies before the current position.
        """
        if pos - self.pos >= 0:
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            # The chunks are bytes objects, so the separator must be too.
            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream, decompressing the raw data
           if necessary.  Fewer bytes are returned at the end of the
           stream.  Raises ReadError if the data cannot be decompressed.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception:
                raise ReadError("invalid compressed data")
            self.dbuf += buf
            c += len(buf)
        buf = self.dbuf[:size]
        self.dbuf = self.dbuf[size:]
        return buf

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            self.buf += buf
            c += len(buf)
        buf = self.buf[:size]
        self.buf = self.buf[size:]
        return buf
# class _Stream
629
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').

       The first block of the wrapped file object is read ahead so that
       its leading bytes can be inspected; the first read() call hands
       that block out again.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        """Return the block that was read ahead; size is ignored."""
        first_block = self.buf
        # Later calls bypass the proxy: the instance attribute shadows
        # this method with the read() of the wrapped file object.
        self.read = self.fileobj.read
        return first_block

    def getcomptype(self):
        """Return "gz", "bz2", "xz" or "tar" depending on the leading
           bytes of the block that was read ahead.
        """
        head = self.buf
        if head.startswith(b"\x1f\x8b\x08"):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        """Close the wrapped file object."""
        self.fileobj.close()
# class StreamProxy
656
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000657#------------------------
658# Extraction file object
659#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        """Expose size bytes of fileobj as a file of their own.

           offset is where the stored data starts in fileobj.  blockinfo
           is an optional list of (offset, size) pairs describing where
           the stored data blocks lie inside the logical file; the areas
           in between read as NUL bytes.  Without blockinfo the data is
           one single block covering the whole file.
        """
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.  Every entry is
        # (is_data, start, stop, offset_in_fileobj); start and stop are
        # positions in the logical file.
        self.map_index = 0
        self.map = []
        covered = 0
        physical = self.offset
        for block_start, block_size in blockinfo:
            block_stop = block_start + block_size
            if block_start > covered:
                self.map.append((False, covered, block_start, None))
            self.map.append((True, block_start, block_stop, physical))
            physical += block_size
            covered = block_stop
        if covered < self.size:
            self.map.append((False, covered, self.size, None))

    def seekable(self):
        """Return whether the underlying file object is seekable."""
        if hasattr(self.fileobj, "seekable"):
            return self.fileobj.seekable()
        # XXX gzip.GzipFile and bz2.BZ2File
        return True

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position):
        """Seek to a position in the file.
        """
        self.position = position

    def read(self, size=None):
        """Read data from the file.

           Return at most size bytes starting at the current position,
           or everything up to the end of the file if size is None.
        """
        remaining = self.size - self.position
        if size is not None:
            remaining = min(size, remaining)

        pieces = []
        while remaining > 0:
            # Find the map entry that contains the current position,
            # starting at the entry used last and wrapping around.
            entry = self.map[self.map_index]
            while not entry[1] <= self.position < entry[2]:
                self.map_index = (self.map_index + 1) % len(self.map)
                entry = self.map[self.map_index]
            is_data, start, stop, offset = entry

            length = min(remaining, stop - self.position)
            if is_data:
                self.fileobj.seek(offset + (self.position - start))
                pieces.append(self.fileobj.read(length))
            else:
                pieces.append(NUL * length)
            remaining -= length
            self.position += length
        return b"".join(pieces)
#class _FileInFile
733
734
class ExFileObject(object):
    """File-like object for reading an archive member.
       Is returned by TarFile.extractfile().

       The object is read-only and returns bytes. Line oriented reads
       use an internal look-ahead buffer that is filled in chunks of
       `blocksize' bytes and discarded on every seek().
    """
    blocksize = 1024

    def __init__(self, tarfile, tarinfo):
        """Create a reader for the member described by tarinfo that
           lives in the archive opened as tarfile.
        """
        self.fileobj = _FileInFile(tarfile.fileobj,
                                   tarinfo.offset_data,
                                   tarinfo.size,
                                   tarinfo.sparse)
        self.name = tarinfo.name
        self.mode = "r"
        self.closed = False
        self.size = tarinfo.size

        self.position = 0       # logical read position inside the member
        self.buffer = b""       # look-ahead data not yet handed out

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def read(self, size=None):
        """Read at most size bytes from the file. If size is not
           present, None or negative, read all data until EOF is
           reached.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        # Follow the io convention: a negative size means "everything".
        # Without this, a negative size would be used as a slice index on
        # the look-ahead buffer and silently drop data.
        if size is not None and size < 0:
            size = None

        buf = b""
        if self.buffer:
            # Hand out data left over from a previous readline() first.
            if size is None:
                buf = self.buffer
                self.buffer = b""
            else:
                buf = self.buffer[:size]
                self.buffer = self.buffer[size:]

        if size is None:
            buf += self.fileobj.read()
        else:
            buf += self.fileobj.read(size - len(buf))

        self.position += len(buf)
        return buf

    # XXX TextIOWrapper uses the read1() method.
    read1 = read

    def readline(self, size=-1):
        """Read one entire line from the file. If size is present
           and non-negative, return a string with at most that
           size, which may be an incomplete line. A size of None or
           any negative value means "no limit".
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        pos = self.buffer.find(b"\n") + 1
        if pos == 0:
            # no newline found.
            while True:
                buf = self.fileobj.read(self.blocksize)
                self.buffer += buf
                if not buf or b"\n" in buf:
                    pos = self.buffer.find(b"\n") + 1
                    if pos == 0:
                        # no newline found.
                        pos = len(self.buffer)
                    break

        # Only a real, non-negative limit may shorten the line.
        if size is not None and size >= 0:
            pos = min(size, pos)

        buf = self.buffer[:pos]
        self.buffer = self.buffer[pos:]
        self.position += len(buf)
        return buf

    def readlines(self):
        """Return a list with all remaining lines.
        """
        result = []
        while True:
            line = self.readline()
            if not line:
                break
            result.append(line)
        return result

    def tell(self):
        """Return the current file position.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        return self.position

    def seek(self, pos, whence=io.SEEK_SET):
        """Seek to a position in the file and return the new absolute
           position. The target is clamped to the range [0, size].
           Raise ValueError for an unknown whence value.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        if whence == io.SEEK_SET:
            self.position = min(max(pos, 0), self.size)
        elif whence == io.SEEK_CUR:
            if pos < 0:
                self.position = max(self.position + pos, 0)
            else:
                self.position = min(self.position + pos, self.size)
        elif whence == io.SEEK_END:
            self.position = max(min(self.size + pos, self.size), 0)
        else:
            raise ValueError("Invalid argument")

        # The look-ahead buffer belongs to the old position.
        self.buffer = b""
        self.fileobj.seek(self.position)
        return self.position

    def close(self):
        """Close the file object.
        """
        self.closed = True

    def __iter__(self):
        """Get an iterator over the file's lines.
        """
        while True:
            line = self.readline()
            if not line:
                break
            yield line
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000871#class ExFileObject
872
873#------------------
874# Exported Classes
875#------------------
876class TarInfo(object):
877 """Informational class which holds the details about an
878 archive member given by a tar header block.
879 TarInfo objects are returned by TarFile.getmember(),
880 TarFile.getmembers() and TarFile.gettarinfo() and are
881 usually created internally.
882 """
883
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000884 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
885 "chksum", "type", "linkname", "uname", "gname",
886 "devmajor", "devminor",
887 "offset", "offset_data", "pax_headers", "sparse",
888 "tarfile", "_sparse_structs", "_link_target")
889
def __init__(self, name=""):
    """Create a TarInfo whose member name is `name' (optional).
       All remaining header fields start out with neutral defaults.
    """
    # Textual header fields.
    self.name = name                # member name
    self.linkname = ""              # link name
    self.uname = self.gname = ""    # user name / group name

    # Numeric header fields.
    self.mode = 0o644               # file permissions
    self.uid = self.gid = 0         # user id / group id
    self.size = 0                   # file size
    self.mtime = 0                  # modification time
    self.chksum = 0                 # header checksum
    self.devmajor = 0               # device major number
    self.devminor = 0               # device minor number
    self.type = REGTYPE             # member type

    # Position of the member inside the archive.
    self.offset = 0                 # the tar header starts here
    self.offset_data = 0            # the file's data starts here

    # Supplemental information.
    self.sparse = None              # sparse member information
    self.pax_headers = {}           # pax header information
913
# pax headers call the "name" field "path" (and "linkname" is called
# "linkpath"); `path' is simply a second spelling of `name'.
def _getpath(self):
    # Read access goes straight to the name attribute.
    return self.name

def _setpath(self, name):
    # Write access replaces the name attribute.
    self.name = name

path = property(fget=_getpath, fset=_setpath)
921
def _getlinkpath(self):
    # `linkpath' is the pax spelling of the linkname attribute.
    return self.linkname

def _setlinkpath(self, linkname):
    # Write access replaces the linkname attribute.
    self.linkname = linkname

linkpath = property(fget=_getlinkpath, fset=_setlinkpath)
927
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000928 def __repr__(self):
929 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
930
def get_info(self):
    """Return the header fields of this TarInfo as a new dictionary.
       The mode is reduced to its permission bits and the name of a
       directory always ends with a slash.
    """
    keys = ("name", "mode", "uid", "gid", "size", "mtime", "chksum",
            "type", "linkname", "uname", "gname", "devmajor", "devminor")
    info = {key: getattr(self, key) for key in keys}

    # Keep only the permission bits of the mode.
    info["mode"] &= 0o7777

    is_dir = info["type"] == DIRTYPE
    if is_dir and not info["name"].endswith("/"):
        info["name"] += "/"

    return info
954
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
    """Return a tar header for this member as a bytes object made up
       of 512 byte blocks, laid out according to format. Raise
       ValueError if format is not one of the known *_FORMAT constants.
    """
    info = self.get_info()

    if format == USTAR_FORMAT:
        return self.create_ustar_header(info, encoding, errors)
    if format == GNU_FORMAT:
        return self.create_gnu_header(info, encoding, errors)
    if format != PAX_FORMAT:
        raise ValueError("invalid format")

    # The pax writer is not given the errors argument.
    return self.create_pax_header(info, encoding)
968
def create_ustar_header(self, info, encoding, errors):
    """Return the member described by info as a single ustar header
       block. Raise ValueError if the linkname or the name does not fit.
       The info dictionary is modified in place.
    """
    info["magic"] = POSIX_MAGIC

    if len(info["linkname"]) > LENGTH_LINK:
        raise ValueError("linkname is too long")

    full_name = info["name"]
    if len(full_name) > LENGTH_NAME:
        # Spread an overlong name over the prefix and name fields.
        prefix, short_name = self._posix_split_name(full_name)
        info["prefix"] = prefix
        info["name"] = short_name

    return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000981
def create_gnu_header(self, info, encoding, errors):
    """Return the member described by info as a GNU header block
       sequence: optional longlink and longname pseudo members followed
       by the regular header block.
    """
    info["magic"] = GNU_MAGIC

    # The longlink sequence (if any) comes before the longname sequence.
    long_fields = (
        ("linkname", LENGTH_LINK, GNUTYPE_LONGLINK),
        ("name", LENGTH_NAME, GNUTYPE_LONGNAME),
    )

    blocks = []
    for key, limit, long_type in long_fields:
        if len(info[key]) > limit:
            blocks.append(self._create_gnu_long_header(info[key], long_type, encoding, errors))

    blocks.append(self._create_header(info, GNU_FORMAT, encoding, errors))
    return b"".join(blocks)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000995
def create_pax_header(self, info, encoding):
    """Return the object as a ustar header block. If it cannot be
       represented this way, prepend a pax extended header sequence
       with supplement information. The info dictionary is modified
       in place.
    """
    info["magic"] = POSIX_MAGIC
    pax_headers = self.pax_headers.copy()

    # String fields go into the pax header when they are not pure ASCII
    # or when they are longer than the ustar field.
    string_fields = (
        ("name", "path", LENGTH_NAME),
        ("linkname", "linkpath", LENGTH_LINK),
        ("uname", "uname", 32),
        ("gname", "gname", 32),
    )
    for field, keyword, limit in string_fields:
        if keyword in pax_headers:
            # The pax header has priority.
            continue

        text = info[field]
        try:
            text.encode("ascii", "strict")
        except UnicodeEncodeError:
            fits = False
        else:
            fits = len(text) <= limit

        if not fits:
            pax_headers[keyword] = text

    # Number fields go into the pax header when they exceed the field
    # limit or are floats. In that case, and also when the pax header
    # already carries the field, the ustar field is zeroed (avoids
    # overflow).
    number_fields = (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12))
    for field, digits in number_fields:
        if field not in pax_headers:
            number = info[field]
            if 0 <= number < 8 ** (digits - 1) and not isinstance(number, float):
                continue
            pax_headers[field] = str(number)
        info[field] = 0

    # Create a pax extended header if necessary.
    extended = b""
    if pax_headers:
        extended = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)

    return extended + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001044
@classmethod
def create_pax_global_header(cls, pax_headers):
    """Return the pax_headers dictionary as a pax global header block
       sequence.
    """
    # Global headers are always requested with the "utf-8" encoding.
    build = cls._create_pax_generic_header
    return build(pax_headers, XGLTYPE, "utf-8")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001050
def _posix_split_name(self, name):
    """Split a name that is too long for the name field into a
       (prefix, name) pair at a slash. Raise ValueError if no usable
       split point exists.
    """
    # Last slash within the first LENGTH_PREFIX + 1 characters.
    cut = name.rfind("/", 0, LENGTH_PREFIX + 1)
    remainder = name[cut + 1:]

    # cut == -1: no slash at all; cut == 0: the prefix would be empty.
    if cut <= 0 or len(remainder) > LENGTH_NAME:
        raise ValueError("name is too long")
    return name[:cut], remainder
1065
@staticmethod
def _create_header(info, format, encoding, errors):
    """Return a header block. info is a dictionary with file
       information, format must be one of the *_FORMAT constants.

       Missing keys in info fall back to neutral defaults. String
       fields are encoded with encoding and errors. The result is a
       bytes object of BLOCKSIZE bytes with the checksum filled in.
    """
    parts = [
        stn(info.get("name", ""), 100, encoding, errors),
        itn(info.get("mode", 0) & 0o7777, 8, format),
        itn(info.get("uid", 0), 8, format),
        itn(info.get("gid", 0), 8, format),
        itn(info.get("size", 0), 12, format),
        itn(info.get("mtime", 0), 12, format),
        # Checksum field placeholder. It must be exactly eight bytes so
        # that it occupies bytes 148-155, which is where the slices below
        # and frombuf() expect the checksum; it is spelled as a product so
        # that the width cannot be lost to whitespace mangling.
        b" " * 8,
        info.get("type", REGTYPE),
        stn(info.get("linkname", ""), 100, encoding, errors),
        info.get("magic", POSIX_MAGIC),
        stn(info.get("uname", ""), 32, encoding, errors),
        stn(info.get("gname", ""), 32, encoding, errors),
        itn(info.get("devmajor", 0), 8, format),
        itn(info.get("devminor", 0), 8, format),
        stn(info.get("prefix", ""), 155, encoding, errors)
    ]

    # struct.pack() pads the joined fields with NUL bytes to a full block.
    buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
    chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
    # Overwrite the first seven placeholder bytes (offsets 148-154) with
    # six octal digits and a NUL; the eighth byte stays a space.
    buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
    return buf
1093
@staticmethod
def _create_payload(payload):
    """Return payload padded with NUL bytes up to the next multiple
       of BLOCKSIZE. A payload that already ends on a block border is
       returned unchanged.
    """
    overhang = len(payload) % BLOCKSIZE
    if overhang:
        payload += NUL * (BLOCKSIZE - overhang)
    return payload
1103
@classmethod
def _create_gnu_long_header(cls, name, type, encoding, errors):
    """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence for
       name: a "././@LongLink" header block followed by the encoded,
       NUL terminated name padded to full blocks.
    """
    payload = name.encode(encoding, errors) + NUL

    info = {
        "name": "././@LongLink",
        "type": type,
        "size": len(payload),
        "magic": GNU_MAGIC,
    }

    # Extended header block first, then the name blocks.
    header = cls._create_header(info, USTAR_FORMAT, encoding, errors)
    return header + cls._create_payload(payload)
1120
@classmethod
def _create_pax_generic_header(cls, pax_headers, type, encoding):
    """Return a POSIX.1-2008 extended or global header sequence
       that contains a list of keyword, value pairs. The values
       must be strings.
    """
    # A value that cannot be encoded as strict UTF-8 (it contains
    # surrogate characters) forces hdrcharset=BINARY for the whole
    # header, see _proc_pax() for more information.
    binary = False
    for text in pax_headers.values():
        try:
            text.encode("utf-8", "strict")
        except UnicodeEncodeError:
            binary = True
            break

    chunks = []
    if binary:
        # The hdrcharset field goes to the beginning of the header.
        chunks.append(b"21 hdrcharset=BINARY\n")

    for keyword, value in pax_headers.items():
        raw_keyword = keyword.encode("utf-8")
        if binary:
            # Try to restore the original byte representation of `value'.
            # Needless to say, that the encoding must match the string.
            raw_value = value.encode(encoding, "surrogateescape")
        else:
            raw_value = value.encode("utf-8")

        # The length field counts itself, so iterate until the total is
        # stable. base covers keyword, value, ' ', '=' and '\n'.
        base = len(raw_keyword) + len(raw_value) + 3
        total, candidate = 0, base + 1
        while candidate != total:
            total = candidate
            candidate = base + len(str(total))

        chunks.append(str(total).encode("ascii") + b" " + raw_keyword +
                      b"=" + raw_value + b"\n")

    records = b"".join(chunks)

    # We use a hardcoded "././@PaxHeader" name like star does
    # instead of the one that POSIX recommends.
    info = {
        "name": "././@PaxHeader",
        "type": type,
        "size": len(records),
        "magic": POSIX_MAGIC,
    }

    # Pax header block first, then the record blocks.
    header = cls._create_header(info, USTAR_FORMAT, "ascii", "replace")
    return header + cls._create_payload(records)
1171
@classmethod
def frombuf(cls, buf, encoding, errors):
    """Construct a TarInfo object from a 512 byte bytes object.

       String fields are decoded with encoding and errors.
       Raises EmptyHeaderError for an empty buffer, TruncatedHeaderError
       for a buffer that is not BLOCKSIZE bytes long, EOFHeaderError for
       a block that consists of NUL bytes only and InvalidHeaderError
       if the stored checksum matches none of the calculated ones.
    """
    if len(buf) == 0:
        raise EmptyHeaderError("empty header")
    if len(buf) != BLOCKSIZE:
        raise TruncatedHeaderError("truncated header")
    if buf.count(NUL) == BLOCKSIZE:
        raise EOFHeaderError("end of file header")

    # The checksum field occupies bytes 148-155 of the block.
    chksum = nti(buf[148:156])
    if chksum not in calc_chksums(buf):
        raise InvalidHeaderError("bad checksum")

    # Copy the fixed-position header fields into a fresh object.
    obj = cls()
    obj.name = nts(buf[0:100], encoding, errors)
    obj.mode = nti(buf[100:108])
    obj.uid = nti(buf[108:116])
    obj.gid = nti(buf[116:124])
    obj.size = nti(buf[124:136])
    obj.mtime = nti(buf[136:148])
    obj.chksum = chksum
    obj.type = buf[156:157]
    obj.linkname = nts(buf[157:257], encoding, errors)
    obj.uname = nts(buf[265:297], encoding, errors)
    obj.gname = nts(buf[297:329], encoding, errors)
    obj.devmajor = nti(buf[329:337])
    obj.devminor = nti(buf[337:345])
    prefix = nts(buf[345:500], encoding, errors)

    # Old V7 tar format represents a directory as a regular
    # file with a trailing slash.
    if obj.type == AREGTYPE and obj.name.endswith("/"):
        obj.type = DIRTYPE

    # The old GNU sparse format occupies some of the unused
    # space in the buffer for up to 4 sparse structures.
    # Save them for later processing in _proc_sparse().
    if obj.type == GNUTYPE_SPARSE:
        pos = 386
        structs = []
        for i in range(4):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                # Stop at the first pair that is not a valid number.
                break
            structs.append((offset, numbytes))
            pos += 24
        # Non-zero byte 482: more sparse structures follow in extra blocks.
        isextended = bool(buf[482])
        origsize = nti(buf[483:495])
        obj._sparse_structs = (structs, isextended, origsize)

    # Remove redundant slashes from directories.
    if obj.isdir():
        obj.name = obj.name.rstrip("/")

    # Reconstruct a ustar longname.
    if prefix and obj.type not in GNU_TYPES:
        obj.name = prefix + "/" + obj.name
    return obj
1234
@classmethod
def fromtarfile(cls, tarfile):
    """Read the next header block from the TarFile object tarfile
       and return the TarInfo object that results from processing it.
    """
    fileobj = tarfile.fileobj
    block = fileobj.read(BLOCKSIZE)
    member = cls.frombuf(block, tarfile.encoding, tarfile.errors)
    # The header block that was just read started one block earlier.
    member.offset = fileobj.tell() - BLOCKSIZE
    return member._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001244
Guido van Rossumd8faa362007-04-27 19:54:29 +00001245 #--------------------------------------------------------------------------
1246 # The following are methods that are called depending on the type of a
1247 # member. The entry point is _proc_member() which can be overridden in a
1248 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1249 # implement the following
1250 # operations:
1251 # 1. Set self.offset_data to the position where the data blocks begin,
1252 # if there is data that follows.
1253 # 2. Set tarfile.offset to the position where the next member's header will
1254 # begin.
1255 # 3. Return self or another valid TarInfo object.
def _proc_member(self, tarfile):
    """Dispatch to the _proc_*() method that handles the type of
       this member and return its result.
    """
    member_type = self.type
    if member_type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
        return self._proc_gnulong(tarfile)
    if member_type == GNUTYPE_SPARSE:
        return self._proc_sparse(tarfile)
    if member_type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
        return self._proc_pax(tarfile)
    # Everything else is handled like a builtin type.
    return self._proc_builtin(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001268
def _proc_builtin(self, tarfile):
    """Process a builtin type or an unknown type which
       will be treated as a regular file.
    """
    data_start = tarfile.fileobj.tell()
    self.offset_data = data_start

    # Only regular files and unknown types are followed by data blocks
    # that have to be skipped.
    has_data = self.isreg() or self.type not in SUPPORTED_TYPES
    tarfile.offset = (data_start + self._block(self.size)) if has_data else data_start

    # Patch the TarInfo object with saved global
    # header information.
    self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

    return self
1285
def _proc_gnulong(self, tarfile):
    """Process the blocks that hold a GNU longname
       or longlink member and return the member they describe.
    """
    raw = tarfile.fileobj.read(self._block(self.size))

    # The long name applies to the member behind the next header.
    try:
        member = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # The returned member starts where this pseudo member starts.
    member.offset = self.offset

    # Pick the attribute that receives the long string.
    attribute = {
        GNUTYPE_LONGNAME: "name",
        GNUTYPE_LONGLINK: "linkname",
    }.get(self.type)
    if attribute is not None:
        setattr(member, attribute, nts(raw, tarfile.encoding, tarfile.errors))

    return member
1307
def _proc_sparse(self, tarfile):
    """Process a GNU sparse header plus extra headers.
    """
    # frombuf() already collected the structures of the first header.
    structs, isextended, origsize = self._sparse_structs
    del self._sparse_structs

    fileobj = tarfile.fileobj

    # Each extended header block holds up to 21 further structures of
    # 24 bytes each; byte 504 tells whether another block follows.
    while isextended:
        block = fileobj.read(BLOCKSIZE)
        for start in range(0, 21 * 24, 24):
            try:
                offset = nti(block[start:start + 12])
                numbytes = nti(block[start + 12:start + 24])
            except ValueError:
                break
            if offset and numbytes:
                structs.append((offset, numbytes))
        isextended = bool(block[504])
    self.sparse = structs

    self.offset_data = fileobj.tell()
    # The header size field is used to skip the data blocks; afterwards
    # the size attribute is replaced by origsize from the sparse header.
    tarfile.offset = self.offset_data + self._block(self.size)
    self.size = origsize
    return self
1335
def _proc_pax(self, tarfile):
    """Process an extended or global header as described in
       POSIX.1-2008 and return the member that follows it.

       Raises InvalidHeaderError if a record claims a length of zero
       and SubsequentHeaderError if the following header is missing
       or bad.
    """
    # Read the header information.
    buf = tarfile.fileobj.read(self._block(self.size))

    # A pax header stores supplemental information for either
    # the following file (extended) or all following files
    # (global).
    if self.type == XGLTYPE:
        pax_headers = tarfile.pax_headers
    else:
        pax_headers = tarfile.pax_headers.copy()

    # Check if the pax header contains a hdrcharset field. This tells us
    # the encoding of the path, linkpath, uname and gname fields. Normally,
    # these fields are UTF-8 encoded but since POSIX.1-2008 tar
    # implementations are allowed to store them as raw binary strings if
    # the translation to UTF-8 fails.
    match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
    if match is not None:
        pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

    # For the time being, we don't care about anything other than "BINARY".
    # The only other value that is currently allowed by the standard is
    # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
    hdrcharset = pax_headers.get("hdrcharset")
    if hdrcharset == "BINARY":
        encoding = tarfile.encoding
    else:
        encoding = "utf-8"

    # Parse pax header information. A record looks like that:
    # "%d %s=%s\n" % (length, keyword, value). length is the size
    # of the complete record including the length field itself and
    # the newline. keyword and value are both UTF-8 encoded strings.
    regex = re.compile(br"(\d+) ([^=]+)=")
    pos = 0
    while True:
        match = regex.match(buf, pos)
        if not match:
            break

        length, keyword = match.groups()
        length = int(length)
        if length == 0:
            # A zero length would never advance pos, so a crafted
            # archive could keep this loop spinning forever.
            raise InvalidHeaderError("invalid header")
        value = buf[match.end(2) + 1:match.start(1) + length - 1]

        # Normally, we could just use "utf-8" as the encoding and "strict"
        # as the error handler, but we better not take the risk. For
        # example, GNU tar <= 1.23 is known to store filenames it cannot
        # translate to UTF-8 as raw strings (unfortunately without a
        # hdrcharset=BINARY header).
        # We first try the strict standard encoding, and if that fails we
        # fall back on the user's encoding and error handler.
        keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                tarfile.errors)
        if keyword in PAX_NAME_FIELDS:
            value = self._decode_pax_field(value, encoding, tarfile.encoding,
                    tarfile.errors)
        else:
            value = self._decode_pax_field(value, "utf-8", "utf-8",
                    tarfile.errors)

        pax_headers[keyword] = value
        pos += length

    # Fetch the next header.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Process GNU sparse information.
    if "GNU.sparse.map" in pax_headers:
        # GNU extended sparse format version 0.1.
        self._proc_gnusparse_01(next, pax_headers)

    elif "GNU.sparse.size" in pax_headers:
        # GNU extended sparse format version 0.0.
        self._proc_gnusparse_00(next, pax_headers, buf)

    elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
        # GNU extended sparse format version 1.0.
        self._proc_gnusparse_10(next, pax_headers, tarfile)

    if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
        # Patch the TarInfo object with the extended header info.
        next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
        next.offset = self.offset

        if "size" in pax_headers:
            # If the extended header replaces the size field,
            # we need to recalculate the offset where the next
            # header starts.
            offset = next.offset_data
            if next.isreg() or next.type not in SUPPORTED_TYPES:
                offset += next._block(next.size)
            tarfile.offset = offset

    return next
1437
def _proc_gnusparse_00(self, next, pax_headers, buf):
    """Process a GNU tar extended sparse header, version 0.0.

       The raw pax record buffer `buf' is scanned for all
       GNU.sparse.offset and GNU.sparse.numbytes records. The values
       are paired up in order of appearance and stored in `next.sparse'
       as a list of (offset, numbytes) tuples. `pax_headers' is not
       consulted by this method.
    """
    starts = [int(found.group(1))
              for found in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf)]
    lengths = [int(found.group(1))
               for found in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf)]
    next.sparse = list(zip(starts, lengths))
1448
def _proc_gnusparse_01(self, next, pax_headers):
    """Process a GNU tar extended sparse header, version 0.1.

       The "GNU.sparse.map" pax header holds a comma-separated list of
       integers. Values at even positions are paired with the values
       that follow them and the resulting tuples are stored in
       `next.sparse'.
    """
    numbers = list(map(int, pax_headers["GNU.sparse.map"].split(",")))
    evens = numbers[::2]
    odds = numbers[1::2]
    next.sparse = list(zip(evens, odds))
1454
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

       The sparse map is read from `tarfile.fileobj' in chunks of
       BLOCKSIZE bytes. Its first line is the number of map entries,
       followed by two newline-terminated decimal numbers per entry.
       `next.offset_data' is set to the file position reached after
       the last chunk that was read, and the pairs are stored in
       `next.sparse'.
    """
    source = tarfile.fileobj
    pending = source.read(BLOCKSIZE)

    # The first line tells how many pairs follow.
    count, pending = pending.split(b"\n", 1)
    wanted = int(count) * 2

    numbers = []
    while len(numbers) < wanted:
        # Fetch another block when no complete line is buffered.
        if b"\n" not in pending:
            pending += source.read(BLOCKSIZE)
        token, pending = pending.split(b"\n", 1)
        numbers.append(int(token))

    next.offset_data = source.tell()
    next.sparse = list(zip(numbers[::2], numbers[1::2]))
1470
def _apply_pax_info(self, pax_headers, encoding, errors):
    """Replace fields with supplemental information from a previous
       pax extended or global header.

       GNU.sparse.name overrides `path'; GNU.sparse.size and
       GNU.sparse.realsize override `size'. Keywords listed in
       PAX_FIELDS are stored as attributes of the same name, numeric
       ones being converted with PAX_NUMBER_FIELDS (0 if the
       conversion raises ValueError). A copy of `pax_headers' is kept
       in self.pax_headers. `encoding' and `errors' are not used here.
    """
    for keyword, value in pax_headers.items():
        if keyword == "GNU.sparse.name":
            self.path = value
        elif keyword in ("GNU.sparse.size", "GNU.sparse.realsize"):
            self.size = int(value)
        elif keyword in PAX_FIELDS:
            converter = PAX_NUMBER_FIELDS.get(keyword) \
                        if keyword in PAX_NUMBER_FIELDS else None
            if keyword in PAX_NUMBER_FIELDS:
                try:
                    value = PAX_NUMBER_FIELDS[keyword](value)
                except ValueError:
                    value = 0
            if keyword == "path":
                # Stored paths carry no trailing slash.
                value = value.rstrip("/")
            setattr(self, keyword, value)

    self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001493
def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
    """Decode a single field from a pax record.

       `value' is first decoded strictly with `encoding'. If that
       raises UnicodeDecodeError, it is decoded again with
       `fallback_encoding' using the `fallback_errors' error handler.
    """
    try:
        text = value.decode(encoding, "strict")
    except UnicodeDecodeError:
        text = value.decode(fallback_encoding, fallback_errors)
    return text
1501
def _block(self, count):
    """Round up a byte count by BLOCKSIZE and return it,
       e.g. _block(834) => 1024.
    """
    whole, partial = divmod(count, BLOCKSIZE)
    if partial:
        # A partially filled block still occupies a full one.
        return (whole + 1) * BLOCKSIZE
    return whole * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001510
def isreg(self):
    """Return True if the member's type is one of REGULAR_TYPES."""
    return self.type in REGULAR_TYPES

def isfile(self):
    """Return the result of isreg()."""
    return self.isreg()

def isdir(self):
    """Return True if the member's type is DIRTYPE."""
    return self.type == DIRTYPE

def issym(self):
    """Return True if the member's type is SYMTYPE."""
    return self.type == SYMTYPE

def islnk(self):
    """Return True if the member's type is LNKTYPE."""
    return self.type == LNKTYPE

def ischr(self):
    """Return True if the member's type is CHRTYPE."""
    return self.type == CHRTYPE

def isblk(self):
    """Return True if the member's type is BLKTYPE."""
    return self.type == BLKTYPE

def isfifo(self):
    """Return True if the member's type is FIFOTYPE."""
    return self.type == FIFOTYPE

def issparse(self):
    """Return True if the `sparse' attribute is set (not None)."""
    return self.sparse is not None

def isdev(self):
    """Return True if the member's type is CHRTYPE, BLKTYPE or FIFOTYPE."""
    return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1531# class TarInfo
1532
1533class TarFile(object):
1534 """The TarFile Class provides an interface to tar archives.
1535 """
1536
1537 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs)
1538
1539 dereference = False # If true, add content of linked file to the
1540 # tar file, else the link.
1541
1542 ignore_zeros = False # If true, skips empty or invalid blocks and
1543 # continues processing.
1544
Lars Gustäbel365aff32009-12-13 11:42:29 +00001545 errorlevel = 1 # If 0, fatal errors only appear in debug
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001546 # messages (if debug >= 0). If > 0, errors
1547 # are passed to the caller as exceptions.
1548
Guido van Rossumd8faa362007-04-27 19:54:29 +00001549 format = DEFAULT_FORMAT # The format to use when creating an archive.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001550
Guido van Rossume7ba4952007-06-06 23:52:48 +00001551 encoding = ENCODING # Encoding for 8-bit character strings.
1552
1553 errors = None # Error handler for unicode conversion.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001554
Guido van Rossumd8faa362007-04-27 19:54:29 +00001555 tarinfo = TarInfo # The default TarInfo class to use.
1556
1557 fileobject = ExFileObject # The default ExFileObject class to use.
1558
def __init__(self, name=None, mode="r", fileobj=None, format=None,
        tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
        errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
    """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
       read from an existing archive, 'a' to append data to an existing
       file or 'w' to create a new file overwriting an existing one. `mode'
       defaults to 'r'.
       If `fileobj' is given, it is used for reading or writing data. If it
       can be determined, `mode' is overridden by `fileobj's mode.
       `fileobj' is not closed, when TarFile is closed.

       The remaining keyword arguments override the class-level defaults
       of the same name when they are not None; `errors' is always
       stored. `pax_headers' is only kept if the format is PAX_FORMAT.

       Raises ValueError for any `mode' other than 'r', 'a' or 'w', and
       ReadError if an archive opened for appending has a bad header.
    """
    # Validate against the lookup table itself. A substring test such as
    # `mode not in "raw"' lets the empty string through, which then
    # fails with a KeyError instead of the documented ValueError.
    modes = {"r": "rb", "a": "r+b", "w": "wb"}
    if mode not in modes:
        raise ValueError("mode must be 'r', 'a' or 'w'")
    self.mode = mode
    self._mode = modes[mode]

    if not fileobj:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        self._extfileobj = False
    else:
        if name is None and hasattr(fileobj, "name"):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Init attributes.
    if format is not None:
        self.format = format
    if tarinfo is not None:
        self.tarinfo = tarinfo
    if dereference is not None:
        self.dereference = dereference
    if ignore_zeros is not None:
        self.ignore_zeros = ignore_zeros
    if encoding is not None:
        self.encoding = encoding
    self.errors = errors

    if pax_headers is not None and self.format == PAX_FORMAT:
        self.pax_headers = pax_headers
    else:
        self.pax_headers = {}

    if debug is not None:
        self.debug = debug
    if errorlevel is not None:
        self.errorlevel = errorlevel

    # Init datastructures.
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    tarinfo = self.tarinfo.fromtarfile(self)
                    self.members.append(tarinfo)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e))

        if self.mode in "aw":
            self._loaded = True

            if self.pax_headers:
                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    except:
        # Deliberately bare: whatever went wrong, release a file that
        # was opened here before passing the exception on unchanged.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001654
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001655 #--------------------------------------------------------------------------
1656 # Below are the classmethods which act as alternate constructors to the
1657 # TarFile class. The open() method is the only one that is needed for
1658 # public use; it is the "super"-constructor and is able to select an
1659 # adequate "sub"-constructor for a particular compression using the mapping
1660 # from OPEN_METH.
1661 #
1662 # This concept allows one to subclass TarFile without losing the comfort of
1663 # the super-constructor. A sub-constructor is registered and made available
1664 # by adding it to the mapping in OPEN_METH.
1665
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
       an appropriate TarFile class.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'r:xz'       open for reading with lzma compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression
       'w:xz'       open for writing with lzma compression

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'r|xz'       open an lzma compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing
       'w|xz'       open an lzma compressed stream for writing

       Raises ValueError if neither `name' nor `fileobj' is given or if
       `mode' cannot be interpreted, CompressionError for an unknown
       compression type and ReadError if no registered method was able
       to open the archive for reading.
    """

    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Find out which *open() is appropriate for opening the file.
        for comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
            if fileobj is not None:
                saved_pos = fileobj.tell()
            try:
                return func(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Rewind a caller-supplied file so the next method
                # starts from the same position.
                if fileobj is not None:
                    fileobj.seek(saved_pos)
                continue
        raise ReadError("file could not be opened successfully")

    elif ":" in mode:
        filemode, comptype = mode.split(":", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
        else:
            raise CompressionError("unknown compression type %r" % comptype)
        return func(name, filemode, fileobj, **kwargs)

    elif "|" in mode:
        filemode, comptype = mode.split("|", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Compare against a tuple: a substring test would accept "rw".
        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            t = cls(name, filemode, stream, **kwargs)
        except:
            # Deliberately bare: close the stream on any failure and
            # pass the exception on unchanged.
            stream.close()
            raise
        t._extfileobj = False
        return t

    # Compare against a tuple: with a substring test the empty string
    # would be routed to taropen() instead of being rejected below.
    elif mode in ("a", "w"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001745
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.

       Returns cls(name, mode, fileobj, **kwargs). Raises ValueError
       if `mode' is not exactly 'r', 'a' or 'w'.
    """
    # A tuple comparison also rejects the empty string, which a
    # substring test against "raw" would let through.
    if mode not in ("r", "a", "w"):
        raise ValueError("mode must be 'r', 'a' or 'w'")
    return cls(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001753
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
       Appending is not allowed.

       If `fileobj' is given, the gzip stream is read from or written
       to it. The returned TarFile owns the GzipFile wrapper created
       here and closes it in close().

       Raises ValueError for a `mode' other than 'r' or 'w',
       CompressionError if the gzip module is not usable and ReadError
       if an IOError occurs while opening the archive (the IOError
       itself is re-raised when the file could not even be opened by
       name).
    """
    # A tuple comparison also rejects the empty string.
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    # Track the wrapper separately from the caller's `fileobj' so that
    # only the object created here is ever closed by this method.
    gzfile = None
    try:
        gzfile = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
        t = cls.taropen(name, mode, gzfile, **kwargs)
    except IOError:
        if gzfile is not None:
            gzfile.close()
        elif fileobj is None:
            # Opening the file by name failed: report that as it is.
            raise
        raise ReadError("not a gzip file")
    except:
        # Deliberately bare: release the wrapper on any failure.
        if gzfile is not None:
            gzfile.close()
        raise
    # The wrapper always belongs to the TarFile, so close() must close
    # it even if the underlying `fileobj' came from the caller.
    # Otherwise the gzip stream would never be finalized in write mode.
    t._extfileobj = False
    return t
1784
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError for a `mode' other than 'r' or 'w',
       CompressionError if the bz2 module cannot be imported and
       ReadError if an IOError or EOFError occurs while opening the
       archive.
    """
    # A tuple comparison also rejects the empty string.
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    fileobj = bz2.BZ2File(filename=name if fileobj is None else None,
            mode=mode, fileobj=fileobj, compresslevel=compresslevel)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (IOError, EOFError):
        fileobj.close()
        raise ReadError("not a bzip2 file")
    except:
        # Deliberately bare: do not leak the BZ2File created above when
        # any other exception escapes from taropen().
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1808
@classmethod
def xzopen(cls, name, mode="r", fileobj=None, preset=9, **kwargs):
    """Open lzma compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError for a `mode' other than 'r' or 'w',
       CompressionError if the lzma module cannot be imported and
       ReadError if an LZMAError or EOFError occurs while opening the
       archive.
    """
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import lzma
    except ImportError:
        raise CompressionError("lzma module is not available")

    if mode == "r":
        # LZMAFile complains about a preset argument in read mode.
        preset = None

    fileobj = lzma.LZMAFile(filename=name if fileobj is None else None,
            mode=mode, fileobj=fileobj, preset=preset)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (lzma.LZMAError, EOFError):
        fileobj.close()
        raise ReadError("not an lzma file")
    except:
        # Deliberately bare: do not leak the LZMAFile created above when
        # any other exception escapes from taropen().
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1836
# All *open() methods are registered here. The keys are the compression
# type names used in mode strings; the values are the names of the
# classmethods that open() looks up with getattr().
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz":  "gzopen",    # gzip compressed tar
    "bz2": "bz2open",   # bzip2 compressed tar
    "xz":  "xzopen"     # lzma compressed tar
}
1844
1845 #--------------------------------------------------------------------------
1846 # The public methods which TarFile provides:
1847
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive.

       Calling close() on an already closed TarFile does nothing. The
       underlying file object is closed only if it was not supplied by
       the caller, and it is closed even when writing the trailing
       blocks fails.
    """
    if self.closed:
        return

    # Mark the archive as closed first so that a failing write below
    # cannot leave it half-closed.
    self.closed = True
    try:
        if self.mode in "aw":
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
    finally:
        if not self._extfileobj:
            self.fileobj.close()
1867
def getmember(self, name):
    """Return a TarInfo object for member `name'. If `name' can not be
       found in the archive, KeyError is raised. If a member occurs more
       than once in the archive, its last occurrence is assumed to be the
       most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001878
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects. The
       list has the same order as the members in the archive.
    """
    self._check()
    if self._loaded:
        return self.members
    # To obtain a list of all members, the whole
    # archive has to be scanned first.
    self._load()
    return self.members
1888
def getnames(self):
    """Return the members of the archive as a list of their names. It has
       the same order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001894
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object for either the file `name' or the file
       object `fileobj' (using os.fstat on its file descriptor). You can
       modify some of the TarInfo's attributes before you add it using
       addfile(). If given, `arcname' specifies an alternative name for the
       file in the archive.

       None is returned if the file is neither a regular file, directory,
       fifo, symbolic link, character device nor block device.
    """
    self._check("aw")

    # A file object always provides the real name.
    if fileobj is not None:
        name = fileobj.name

    # Build the member name: drop a drive prefix, convert the
    # platform separator to forward slashes and make the path relative.
    if arcname is None:
        arcname = name
    arcname = os.path.splitdrive(arcname)[1]
    arcname = arcname.replace(os.sep, "/").lstrip("/")

    # The TarInfo object is filled with the file's
    # information further down.
    tarinfo = self.tarinfo()
    tarinfo.tarfile = self

    # Pick fstat, lstat or stat depending on what was passed in,
    # on the platform and on whether symlinks shall be resolved.
    if fileobj is not None:
        st = os.fstat(fileobj.fileno())
    elif hasattr(os, "lstat") and not self.dereference:
        st = os.lstat(name)
    else:
        st = os.stat(name)

    linkname = ""
    st_mode = st.st_mode

    if stat.S_ISREG(st_mode):
        inode = (st.st_ino, st.st_dev)
        # Is it a hardlink to an already archived file?
        is_hardlink = (not self.dereference and st.st_nlink > 1 and
                       inode in self.inodes and
                       arcname != self.inodes[inode])
        if is_hardlink:
            member_type = LNKTYPE
            linkname = self.inodes[inode]
        else:
            member_type = REGTYPE
            # The inode is remembered only if it is valid.
            # For win32 it is always 0.
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(st_mode):
        member_type = DIRTYPE
    elif stat.S_ISFIFO(st_mode):
        member_type = FIFOTYPE
    elif stat.S_ISLNK(st_mode):
        member_type = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(st_mode):
        member_type = CHRTYPE
    elif stat.S_ISBLK(st_mode):
        member_type = BLKTYPE
    else:
        return None

    # Store everything that could be found out.
    tarinfo.name = arcname
    tarinfo.mode = st_mode
    tarinfo.uid = st.st_uid
    tarinfo.gid = st.st_gid
    # Only regular members carry data.
    tarinfo.size = st.st_size if member_type == REGTYPE else 0
    tarinfo.mtime = st.st_mtime
    tarinfo.type = member_type
    tarinfo.linkname = linkname

    # Owner and group names are filled in when they can be looked up.
    if pwd:
        try:
            tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
        except KeyError:
            pass

    if member_type in (CHRTYPE, BLKTYPE):
        if hasattr(os, "major") and hasattr(os, "minor"):
            tarinfo.devmajor = os.major(st.st_rdev)
            tarinfo.devminor = os.minor(st.st_rdev)
    return tarinfo
1992
def list(self, verbose=True):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced.
    """
    self._check()

    def emit(*fields):
        # Every column is followed by a single space.
        print(*fields, end=' ')

    for member in self:
        if verbose:
            emit(filemode(member.mode))
            emit("%s/%s" % (member.uname or member.uid,
                            member.gname or member.gid))
            if member.ischr() or member.isblk():
                device = "%d,%d" % (member.devmajor, member.devminor)
                emit("%10s" % device)
            else:
                emit("%10d" % member.size)
            emit("%d-%02d-%02d %02d:%02d:%02d"
                 % time.localtime(member.mtime)[:6])

        # Directories are marked with a trailing slash.
        suffix = "/" if member.isdir() else ""
        emit(member.name + suffix)

        if verbose:
            if member.issym():
                emit("->", member.linkname)
            if member.islnk():
                emit("link to", member.linkname)
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002021
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. `exclude' is a function that should
       return True for each filename to be excluded. `filter' is a function
       that expects a TarInfo object argument and returns the changed
       TarInfo object, if it returns None the TarInfo object will be
       excluded from the archive.

       Passing `exclude' issues a DeprecationWarning. The archive file
       itself and files of unsupported types are skipped.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)

    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        # The with statement closes the source file even if
        # addfile() raises.
        with bltn_open(name, "rb") as f:
            self.addfile(tarinfo, f)

    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            for f in os.listdir(name):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                        recursive, exclude, filter=filter)

    else:
        self.addfile(tarinfo)
2083
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, tarinfo.size bytes are read from it and added to the archive.
       You can create TarInfo objects using gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.
    """
    self._check("aw")

    # Work on a copy so that the caller's object is not the one
    # recorded in the member list.
    tarinfo = copy.copy(tarinfo)

    header = tarinfo.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset += len(header)

    # If there's data to follow, append it and
    # pad the last block with zeros.
    if fileobj is not None:
        copyfileobj(fileobj, self.fileobj, tarinfo.size)
        full_blocks, tail = divmod(tarinfo.size, BLOCKSIZE)
        if tail > 0:
            self.fileobj.write(NUL * (BLOCKSIZE - tail))
            full_blocks += 1
        self.offset += full_blocks * BLOCKSIZE

    self.members.append(tarinfo)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002109
def extractall(self, path=".", members=None):
    """Extract every member of the archive below `path' (the current
       working directory by default). Owner, modification time and
       permissions of directories are applied only after all members
       have been extracted. `members' is optional and must be a subset
       of the list returned by getmembers().
    """
    pending_dirs = []
    source = self if members is None else members

    for member in source:
        if member.isdir():
            # Remember the real directory entry, but extract a copy that
            # carries a safe, owner-only mode.
            pending_dirs.append(member)
            member = copy.copy(member)
            member.mode = 0o700
        # Directory attributes are applied in the second pass below.
        self.extract(member, path, set_attrs=not member.isdir())

    # Walk the directories in reverse name order, so nested directories
    # are handled before their parents.
    for member in reversed(sorted(pending_dirs, key=lambda d: d.name)):
        dirpath = os.path.join(path, member.name)
        try:
            self.chown(member, dirpath)
            self.utime(member, dirpath)
            self.chmod(member, dirpath)
        except ExtractError as exc:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % exc)
2147
def extract(self, member, path="", set_attrs=True):
    """Extract one member below `path' (the current working directory by
       default), using its full name. `member' may be a filename or a
       TarInfo object. File attributes (owner, mtime, mode) are set
       unless `set_attrs' is False. Depending on self.errorlevel, errors
       are either re-raised or only reported through _dbg().
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # makelink() needs to know where a hard link's target ends up on disk.
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    destination = os.path.join(path, tarinfo.name)
    try:
        self._extract_member(tarinfo, destination, set_attrs=set_attrs)
    except EnvironmentError as exc:
        if self.errorlevel > 0:
            raise
        if exc.filename is None:
            self._dbg(1, "tarfile: %s" % exc.strerror)
        else:
            self._dbg(1, "tarfile: %s %r" % (exc.strerror, exc.filename))
    except ExtractError as exc:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % exc)
2182
def extractfile(self, member):
    """Return a member of the archive as a file object. `member' may be
       a filename or a TarInfo object. For a regular file (or a member
       of unknown type) a file-like object is returned; for a link, the
       file-like object of the link's target is returned; for anything
       else the result is None.
       The file-like object is read-only and provides the following
       methods: read(), readline(), readlines(), seek() and tell()
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Members of unknown type are treated like regular files.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    if not (tarinfo.islnk() or tarinfo.issym()):
        # Directories, devices etc. carry no data.
        return None

    if isinstance(self.fileobj, _Stream):
        # A small but ugly workaround for the case that someone tries
        # to extract a (sym)link as a file-object from a non-seekable
        # stream of tar blocks.
        raise StreamError("cannot extract (sym)link as file object")

    # A (sym)link's file object is its target's file object.
    return self.extractfile(self._find_link_target(tarinfo))
2220
def _extract_member(self, tarinfo, targetpath, set_attrs=True):
    """Write the TarInfo object tarinfo to the filesystem as
       targetpath, dispatching to the make*() method that matches the
       member's type. Owner, mode and mtime are applied afterwards
       unless set_attrs is false.
    """
    # Drop trailing slashes and switch to the platform's separator.
    targetpath = targetpath.rstrip("/").replace("/", os.sep)

    # Parent directories that are missing are created with default
    # permissions.
    parent = os.path.dirname(targetpath)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    if tarinfo.islnk() or tarinfo.issym():
        self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
    else:
        self._dbg(1, tarinfo.name)

    # Select the creator for this member type, then run it.
    if tarinfo.isreg():
        create = self.makefile
    elif tarinfo.isdir():
        create = self.makedir
    elif tarinfo.isfifo():
        create = self.makefifo
    elif tarinfo.ischr() or tarinfo.isblk():
        create = self.makedev
    elif tarinfo.islnk() or tarinfo.issym():
        create = self.makelink
    elif tarinfo.type not in SUPPORTED_TYPES:
        create = self.makeunknown
    else:
        create = self.makefile
    create(tarinfo, targetpath)

    if not set_attrs:
        return

    self.chown(tarinfo, targetpath)
    # Mode and mtime are not applied to symbolic links.
    if not tarinfo.issym():
        self.chmod(tarinfo, targetpath)
        self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002263
2264 #--------------------------------------------------------------------------
2265 # Below are the different file methods. They are called via
2266 # _extract_member() when extract() is called. They can be replaced in a
2267 # subclass to implement other functionality.
2268
def makedir(self, tarinfo, targetpath):
    """Create the directory targetpath. A directory that already
       exists is left as it is.
    """
    # Owner-only permissions for now; the mode recorded in the archive
    # is applied later through chmod().
    safe_mode = 0o700
    try:
        os.mkdir(targetpath, safe_mode)
    except FileExistsError:
        return
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002278
def makefile(self, tarinfo, targetpath):
    """Make a file called targetpath and fill it with the member's data.

       The data is read from self.fileobj starting at tarinfo.offset_data.
       For sparse members every (offset, size) entry of tarinfo.sparse is
       copied to its offset in the target. In either case the file is then
       truncated (or extended) to tarinfo.size.
    """
    source = self.fileobj
    source.seek(tarinfo.offset_data)
    # The with-block guarantees that the target is closed even if
    # copying fails half-way, so no file handle is leaked.
    with bltn_open(targetpath, "wb") as target:
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
        else:
            copyfileobj(source, target, tarinfo.size)
        # Fix the final length; for sparse files this also creates a
        # trailing hole when the last data chunk ends before tarinfo.size.
        target.seek(tarinfo.size)
        target.truncate()
2294
def makeunknown(self, tarinfo, targetpath):
    """Extract a member of unknown type to targetpath as if it were a
       regular file and report that through _dbg().
    """
    self.makefile(tarinfo, targetpath)
    note = "tarfile: Unknown file type %r, extracted as regular file."
    self._dbg(1, note % tarinfo.type)
2302
def makefifo(self, tarinfo, targetpath):
    """Create a fifo at targetpath. Raise ExtractError when the
       os module offers no mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002310
def makedev(self, tarinfo, targetpath):
    """Create a character or block device node at targetpath. Raise
       ExtractError when the os module lacks mknod() or makedev().
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the permission bits with the matching device type flag.
    type_flag = stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR
    mode = tarinfo.mode | type_flag

    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, mode, device)
2325
def makelink(self, tarinfo, targetpath):
    """Create a symbolic or hard link at targetpath. When the link
       cannot be created (platform limitation), the referenced archive
       member is extracted to targetpath instead.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        elif os.path.exists(tarinfo._link_target):
            # _link_target was prepared by extract().
            os.link(tarinfo._link_target, targetpath)
        else:
            # The hard link's target is not on disk; extract the
            # referenced member in its place.
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
    except symlink_exception:
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002348
def chown(self, tarinfo, targetpath):
    """Set the owner of targetpath according to tarinfo. Nothing is
       done unless the effective user id is 0. Raise ExtractError when
       changing the owner fails.
    """
    # We have to be root to do so.
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        return

    # Prefer the ids that belong to the stored names; fall back to the
    # stored numeric ids when a name is unknown on this system.
    try:
        gid = grp.getgrnam(tarinfo.gname)[2]
    except KeyError:
        gid = tarinfo.gid
    try:
        uid = pwd.getpwnam(tarinfo.uname)[2]
    except KeyError:
        uid = tarinfo.uid

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, uid, gid)
        elif sys.platform != "os2emx":
            os.chown(targetpath, uid, gid)
    except EnvironmentError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002370
def chmod(self, tarinfo, targetpath):
    """Apply the permission bits stored in tarinfo to targetpath.
       Raise ExtractError when the mode cannot be changed.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except EnvironmentError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002379
def utime(self, tarinfo, targetpath):
    """Set both access and modification time of targetpath to
       tarinfo.mtime. Raise ExtractError when that fails.
    """
    if hasattr(os, 'utime'):
        try:
            stamp = tarinfo.mtime
            os.utime(targetpath, (stamp, stamp))
        except EnvironmentError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002389
2390 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, when
       TarFile is opened for reading. Return None when no further
       member is available.
    """
    self._check("ra")

    # A member that was stored in firstmember is handed out first.
    pending = self.firstmember
    if pending is not None:
        self.firstmember = None
        return pending

    def skip_block(reason):
        # Report the unusable block and move on to the following one.
        self._dbg(2, "0x%X: %s" % (self.offset, reason))
        self.offset += BLOCKSIZE

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as exc:
            if self.ignore_zeros:
                skip_block(exc)
                continue
        except InvalidHeaderError as exc:
            if self.ignore_zeros:
                skip_block(exc)
                continue
            elif self.offset == 0:
                raise ReadError(str(exc))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as exc:
            if self.offset == 0:
                raise ReadError(str(exc))
        except SubsequentHeaderError as exc:
            raise ReadError(str(exc))
        # Either a header was read or the error was not fatal: stop.
        break

    if tarinfo is None:
        # Nothing more could be read; the member list is complete.
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002436
2437 #--------------------------------------------------------------------------
2438 # Little helper methods:
2439
def _getmember(self, name, tarinfo=None, normalize=False):
    """Look up an archive member by name, searching from the last
       member towards the first. If tarinfo is given, only the members
       in front of it are considered. With normalize set, both names
       are compared after os.path.normpath(). Return None when there
       is no match.
    """
    # getmembers() makes sure that all members have been loaded.
    candidates = self.getmembers()

    # Limit the member search list up to tarinfo.
    if tarinfo is not None:
        candidates = candidates[:candidates.index(tarinfo)]

    if normalize:
        name = os.path.normpath(name)

        def key(entry):
            return os.path.normpath(entry.name)
    else:
        def key(entry):
            return entry.name

    for candidate in reversed(candidates):
        if name == key(candidate):
            return candidate
    return None
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002462
def _load(self):
    """Call next() until it returns None, i.e. read through the whole
       archive, then mark the TarFile as loaded.
    """
    while self.next() is not None:
        pass
    self._loaded = True
2472
def _check(self, mode=None):
    """Raise IOError if the TarFile is closed or if, with mode given,
       the TarFile's own mode is not contained in mode.
    """
    if self.closed:
        raise IOError("%s is closed" % self.__class__.__name__)
    if mode is None:
        return
    if self.mode not in mode:
        raise IOError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002481
def _find_link_target(self, tarinfo):
    """Return the archive member that a symlink or hardlink member
       points to. Raise KeyError when no such member exists.
    """
    if tarinfo.issym():
        # A symlink target is resolved relative to the link's directory
        # and may be anywhere in the archive.
        linkname = "/".join((os.path.dirname(tarinfo.name), tarinfo.linkname))
        limit = None
    else:
        # Search the archive before the link, because a hard link is
        # just a reference to an already archived file.
        linkname = tarinfo.linkname
        limit = tarinfo

    found = self._getmember(linkname, tarinfo=limit, normalize=True)
    if found is None:
        raise KeyError("linkname %r not found" % linkname)
    return found
2500
def __iter__(self):
    """Return an iterator over the members: a plain list iterator when
       all members are loaded already, a TarIter otherwise.
    """
    if not self._loaded:
        return TarIter(self)
    return iter(self.members)
2508
def _dbg(self, level, msg):
    """Print msg to sys.stderr if level does not exceed self.debug.
    """
    if level > self.debug:
        return
    print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002514
def __enter__(self):
    """Enter the runtime context: verify through _check() that the
       TarFile is usable and return the TarFile itself.
    """
    self._check()
    return self
2518
def __exit__(self, type, value, traceback):
    """Leave the runtime context. Without an exception the TarFile is
       closed regularly; otherwise only the underlying file object is
       closed (unless _extfileobj is set) and the TarFile is marked as
       closed.
    """
    if type is not None:
        # An exception occurred. We must not call close() because
        # it would try to write end-of-archive blocks and padding.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        return
    self.close()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002528# class TarFile
2529
class TarIter:
    """Iterator over the members of a TarFile.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Create an iterator for tarfile that starts at the first
           member.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return the iterator itself.
        """
        return self

    def __next__(self):
        """Return the next member. As long as the TarFile is not fully
           loaded the member comes from its next() method; once that
           yields nothing, the TarFile is marked as _loaded.
        """
        archive = self.tarfile
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely. Therefore
        # a loaded archive is served from its member list by index.
        if archive._loaded:
            try:
                tarinfo = archive.members[self.index]
            except IndexError:
                raise StopIteration
        else:
            tarinfo = archive.next()
            if not tarinfo:
                archive._loaded = True
                raise StopIteration
        self.index += 1
        return tarinfo
2565
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002566#--------------------
2567# exported functions
2568#--------------------
def is_tarfile(name):
    """Return True if name can be opened as a tar archive by this
       module, False if opening it raises TarError.
    """
    try:
        # `open' refers to this module's open(), not the builtin.
        open(name).close()
    except TarError:
        return False
    return True
2579
# Keep the builtin open() reachable as bltn_open, because the module-level
# name `open' is rebound to TarFile.open on the next line.
bltn_open = open
open = TarFile.open