blob: e3380a2d6b3f5f0534ce3582169f22c849e3b5c0 [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
Guido van Rossumd8faa362007-04-27 19:54:29 +000032version = "0.9.0"
Guido van Rossum98297ee2007-11-06 21:34:58 +000033__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
Senthil Kumaran7c9719c2011-07-28 22:32:49 +080034__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
35__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
Guido van Rossum98297ee2007-11-06 21:34:58 +000036__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000037
38#---------
39# Imports
40#---------
41import sys
42import os
Eli Bendersky74c503b2012-01-03 06:26:13 +020043import io
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000044import shutil
45import stat
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000046import time
47import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000048import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000049import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000050
51try:
52 import grp, pwd
53except ImportError:
54 grp = pwd = None
55
# Exceptions that signal "symlinks cannot be created here".
# os.symlink may be absent (AttributeError) or, on Windows prior to 6.0,
# raise NotImplementedError.  Where the WindowsError name exists it is
# included as well: error 1314 is raised if the caller does not hold the
# SeCreateSymbolicLinkPrivilege privilege.
try:
    symlink_exception = (AttributeError, NotImplementedError, WindowsError)
except NameError:
    # No WindowsError on this platform.
    symlink_exception = (AttributeError, NotImplementedError)
64
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000065# from tarfile import *
66__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
67
Georg Brandl1a3284e2007-12-02 09:40:06 +000068from builtins import open as _open # Since 'open' is TarFile.open
Guido van Rossum8f78fe92006-08-24 04:03:53 +000069
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000070#---------------------------------------------------------
71# tar constants
72#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records

# Both magic strings cover the 6-byte "magic" field plus the 2-byte
# "version" field of a header, so each must be exactly 8 bytes long.
# The GNU variant is "ustar" followed by TWO spaces and a null byte.
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

# Values of the one-byte type flag in a member header.
REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

# Archive formats that can be written.
USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000105
106#---------------------------------------------------------
107# tarfile constants
108#---------------------------------------------------------
# Type flags that belong to the GNU tar format.
GNU_TYPES = (
    GNUTYPE_LONGNAME,
    GNUTYPE_LONGLINK,
    GNUTYPE_SPARSE,
)

# Type flags whose members are handled like a regular file.
REGULAR_TYPES = (
    REGTYPE,
    AREGTYPE,
    CONTTYPE,
    GNUTYPE_SPARSE,
)

# Every type flag that tarfile supports: the standard member types
# followed by the GNU-specific ones.
SUPPORTED_TYPES = (
    REGTYPE,
    AREGTYPE,
    LNKTYPE,
    SYMTYPE,
    DIRTYPE,
    FIFOTYPE,
    CONTTYPE,
    CHRTYPE,
    BLKTYPE,
) + GNU_TYPES
123
# Pax header keywords whose value overrides the TarInfo attribute of
# the same meaning.
PAX_FIELDS = (
    "path",
    "linkpath",
    "size",
    "mtime",
    "uid",
    "gid",
    "uname",
    "gname",
)

# Pax header keywords whose value is affected by hdrcharset.
PAX_NAME_FIELDS = set(("path", "linkpath", "uname", "gname"))

# Pax header keywords that hold numbers, mapped to the type used to
# convert them.  Every keyword not listed here is treated as a string.
PAX_NUMBER_FIELDS = dict(
    atime=float,
    ctime=float,
    mtime=float,
    uid=int,
    gid=int,
    size=int,
)
141
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000142#---------------------------------------------------------
143# Bits used in the mode field, values in octal.
144#---------------------------------------------------------
# File type bits.
S_IFLNK = 0o120000      # symbolic link
S_IFREG = 0o100000      # regular file
S_IFBLK = 0o060000      # block device
S_IFDIR = 0o040000      # directory
S_IFCHR = 0o020000      # character device
S_IFIFO = 0o010000      # fifo

# Special permission bits.
TSUID = 0o4000          # set UID on execution
TSGID = 0o2000          # set GID on execution
TSVTX = 0o1000          # reserved

# Owner permission bits.
TUREAD = 0o400          # read by owner
TUWRITE = 0o200         # write by owner
TUEXEC = 0o100          # execute/search by owner

# Group permission bits.
TGREAD = 0o040          # read by group
TGWRITE = 0o020         # write by group
TGEXEC = 0o010          # execute/search by group

# Permission bits for everybody else.
TOREAD = 0o004          # read by other
TOWRITE = 0o002         # write by other
TOEXEC = 0o001          # execute/search by other
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000165
166#---------------------------------------------------------
Guido van Rossumd8faa362007-04-27 19:54:29 +0000167# initialization
168#---------------------------------------------------------
# Default encoding: always UTF-8 on Windows ("nt") and Windows CE ("ce"),
# the filesystem encoding everywhere else.
ENCODING = "utf-8" if os.name in ("nt", "ce") else sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000173
174#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000175# Some useful functions
176#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000177
def stn(s, length, encoding, errors):
    """Encode the string s into a bytes field of the given length.

    The encoded text is cut off after length bytes; if it is shorter,
    it is padded on the right with null bytes.  A value that fills the
    whole field therefore carries no terminating null byte.
    """
    field = s.encode(encoding, errors)[:length]
    # A non-positive repeat count yields b"", so nothing is appended
    # once the field is already full.
    return field + NUL * (length - len(field))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
def nts(s, encoding, errors):
    """Decode a null-terminated bytes object into a string.

    Everything from the first null byte onwards is discarded; when s
    contains no null byte it is decoded as a whole.
    """
    text, _, _ = s.partition(b"\0")
    return text.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000191
def nti(s):
    """Convert a number field to a python number.

    Raises InvalidHeaderError if the field holds neither a base-256
    value nor a valid octal string.
    """
    # There are two possible encodings for a number field, see
    # itn() below.
    marker = s[0]
    if marker not in (0o200, 0o377):
        # Plain octal digits; an empty field counts as zero.
        try:
            return int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")

    # Base-256: the bytes after the marker are a big-endian number.
    n = int.from_bytes(s[1:], "big")
    if marker == 0o377:
        # A 0o377 marker denotes a negative number.
        n -= 256 ** (len(s) - 1)
    return n
210
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.

    Returns a field of digits bytes.  Raises ValueError if n does not
    fit into the field in the given format.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicates this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    width = digits - 1

    if 0 <= n < 8 ** width:
        return bytes("%0*o" % (width, n), "ascii") + NUL

    if format != GNU_FORMAT or not (-256 ** width <= n < 256 ** width):
        raise ValueError("overflow in number field")

    if n >= 0:
        marker = 0o200
    else:
        marker = 0o377
        # Shift the negative value into the non-negative range; only
        # its lowest `width` bytes are stored below.
        n = 256 ** digits + n

    field = bytearray([marker])
    # Most significant byte first.
    field.extend((n >> (8 * shift)) & 0o377
                 for shift in range(width - 1, -1, -1))
    return field
238
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a member's header.

    All bytes of the 512-byte header are summed up except for the
    chksum field (bytes 148-155), which is treated as if it was filled
    with spaces: eight spaces contribute the constant 256.  According
    to the GNU tar sources, some tars (Sun and NeXT) calculate chksum
    with signed char, which gives a different result if the buffer
    holds bytes with the high bit set, so both variants are computed.
    """
    # "8x" skips over the chksum field itself.
    as_unsigned = struct.unpack_from("148B8x356B", buf)
    as_signed = struct.unpack_from("148b8x356b", buf)
    return 256 + sum(as_unsigned), 256 + sum(as_signed)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000251
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.

    If length is None, the entire content is copied.  Raises IOError
    if src runs out of data before length bytes were copied.
    """
    if length is None:
        shutil.copyfileobj(src, dst)
        return
    if length == 0:
        return

    def transfer(count):
        # Move exactly count bytes, or fail on a short read.
        data = src.read(count)
        if len(data) < count:
            raise IOError("end of file reached")
        dst.write(data)

    chunk_size = 16 * 1024
    full_chunks, tail = divmod(length, chunk_size)
    for _ in range(full_chunks):
        transfer(chunk_size)
    if tail != 0:
        transfer(tail)
276
# One inner tuple per character of the mode string.  Within each inner
# tuple the (bits, character) pairs are tried in order and the first
# pair whose bits are all set in the mode wins.
filemode_table = (
    # file type
    ((S_IFLNK, "l"),
     (S_IFREG, "-"),
     (S_IFBLK, "b"),
     (S_IFDIR, "d"),
     (S_IFCHR, "c"),
     (S_IFIFO, "p")),

    # owner
    ((TUREAD, "r"),),
    ((TUWRITE, "w"),),
    ((TUEXEC|TSUID, "s"),
     (TSUID, "S"),
     (TUEXEC, "x")),

    # group
    ((TGREAD, "r"),),
    ((TGWRITE, "w"),),
    ((TGEXEC|TSGID, "s"),
     (TSGID, "S"),
     (TGEXEC, "x")),

    # other
    ((TOREAD, "r"),),
    ((TOWRITE, "w"),),
    ((TOEXEC|TSVTX, "t"),
     (TSVTX, "T"),
     (TOEXEC, "x"))
)

def filemode(mode):
    """Convert a file's mode to a string of the form
       -rwxrwxrwx.
       Used by TarFile.list()
    """
    def symbol(choices):
        # First character whose bits are all set, "-" if none match.
        return next((char for bit, char in choices if mode & bit == bit),
                    "-")

    return "".join(symbol(choices) for choices in filemode_table)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000318
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000352
353#---------------------------
354# internal stream interface
355#---------------------------
356class _LowLevelFile:
357 """Low-level file object. Supports reading and writing.
358 It is used instead of a regular file object for streaming
359 access.
360 """
361
362 def __init__(self, name, mode):
363 mode = {
364 "r": os.O_RDONLY,
365 "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
366 }[mode]
367 if hasattr(os, "O_BINARY"):
368 mode |= os.O_BINARY
Lars Gustäbeld6eb70b2010-04-29 15:37:02 +0000369 self.fd = os.open(name, mode, 0o666)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000370
371 def close(self):
372 os.close(self.fd)
373
374 def read(self, size):
375 return os.read(self.fd, size)
376
377 def write(self, s):
378 os.write(self.fd, s)
379
380class _Stream:
381 """Class that serves as an adapter between TarFile and
382 a stream-like object. The stream-like object only
383 needs to have a read() or write() method and is accessed
384 blockwise. Use of gzip or bzip2 compression is possible.
385 A stream-like object could be for example: sys.stdin,
386 sys.stdout, a socket, a tape device etc.
387
388 _Stream is intended to be used only internally.
389 """
390
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000391 def __init__(self, name, mode, comptype, fileobj, bufsize):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000392 """Construct a _Stream object.
393 """
394 self._extfileobj = True
395 if fileobj is None:
396 fileobj = _LowLevelFile(name, mode)
397 self._extfileobj = False
398
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000399 if comptype == '*':
400 # Enable transparent compression detection for the
401 # stream interface
402 fileobj = _StreamProxy(fileobj)
403 comptype = fileobj.getcomptype()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000404
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000405 self.name = name or ""
406 self.mode = mode
407 self.comptype = comptype
408 self.fileobj = fileobj
409 self.bufsize = bufsize
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000410 self.buf = b""
Guido van Rossume2a383d2007-01-15 16:59:06 +0000411 self.pos = 0
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000412 self.closed = False
413
Antoine Pitrou605c2932010-09-23 20:15:14 +0000414 try:
415 if comptype == "gz":
416 try:
417 import zlib
418 except ImportError:
419 raise CompressionError("zlib module is not available")
420 self.zlib = zlib
421 self.crc = zlib.crc32(b"")
422 if mode == "r":
423 self._init_read_gz()
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100424 self.exception = zlib.error
Antoine Pitrou605c2932010-09-23 20:15:14 +0000425 else:
426 self._init_write_gz()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000427
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100428 elif comptype == "bz2":
Antoine Pitrou605c2932010-09-23 20:15:14 +0000429 try:
430 import bz2
431 except ImportError:
432 raise CompressionError("bz2 module is not available")
433 if mode == "r":
434 self.dbuf = b""
435 self.cmp = bz2.BZ2Decompressor()
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100436 self.exception = IOError
Antoine Pitrou605c2932010-09-23 20:15:14 +0000437 else:
438 self.cmp = bz2.BZ2Compressor()
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100439
440 elif comptype == "xz":
441 try:
442 import lzma
443 except ImportError:
444 raise CompressionError("lzma module is not available")
445 if mode == "r":
446 self.dbuf = b""
447 self.cmp = lzma.LZMADecompressor()
448 self.exception = lzma.LZMAError
449 else:
450 self.cmp = lzma.LZMACompressor()
451
452 elif comptype != "tar":
453 raise CompressionError("unknown compression type %r" % comptype)
454
Antoine Pitrou605c2932010-09-23 20:15:14 +0000455 except:
456 if not self._extfileobj:
457 self.fileobj.close()
458 self.closed = True
459 raise
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000460
461 def __del__(self):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000462 if hasattr(self, "closed") and not self.closed:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000463 self.close()
464
465 def _init_write_gz(self):
466 """Initialize for writing with gzip compression.
467 """
468 self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
469 -self.zlib.MAX_WBITS,
470 self.zlib.DEF_MEM_LEVEL,
471 0)
Guido van Rossume2a383d2007-01-15 16:59:06 +0000472 timestamp = struct.pack("<L", int(time.time()))
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000473 self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000474 if self.name.endswith(".gz"):
475 self.name = self.name[:-3]
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000476 # RFC1952 says we must use ISO-8859-1 for the FNAME field.
477 self.__write(self.name.encode("iso-8859-1", "replace") + NUL)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000478
479 def write(self, s):
480 """Write string s to the stream.
481 """
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000482 if self.comptype == "gz":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000483 self.crc = self.zlib.crc32(s, self.crc)
484 self.pos += len(s)
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000485 if self.comptype != "tar":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000486 s = self.cmp.compress(s)
487 self.__write(s)
488
489 def __write(self, s):
490 """Write string s to the stream if a whole new block
491 is ready to be written.
492 """
493 self.buf += s
494 while len(self.buf) > self.bufsize:
495 self.fileobj.write(self.buf[:self.bufsize])
496 self.buf = self.buf[self.bufsize:]
497
498 def close(self):
499 """Close the _Stream object. No operation should be
500 done on it afterwards.
501 """
502 if self.closed:
503 return
504
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000505 if self.mode == "w" and self.comptype != "tar":
Martin v. Löwisc234a522004-08-22 21:28:33 +0000506 self.buf += self.cmp.flush()
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000507
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000508 if self.mode == "w" and self.buf:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000509 self.fileobj.write(self.buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000510 self.buf = b""
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000511 if self.comptype == "gz":
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000512 # The native zlib crc is an unsigned 32-bit integer, but
513 # the Python wrapper implicitly casts that to a signed C
514 # long. So, on a 32-bit box self.crc may "look negative",
515 # while the same crc on a 64-bit box may "look positive".
516 # To avoid irksome warnings from the `struct` module, force
517 # it to look positive on all boxes.
Guido van Rossume2a383d2007-01-15 16:59:06 +0000518 self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
519 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000520
521 if not self._extfileobj:
522 self.fileobj.close()
523
524 self.closed = True
525
526 def _init_read_gz(self):
527 """Initialize for reading a gzip compressed fileobj.
528 """
529 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000530 self.dbuf = b""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000531
532 # taken from gzip.GzipFile with some alterations
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000533 if self.__read(2) != b"\037\213":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000534 raise ReadError("not a gzip file")
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000535 if self.__read(1) != b"\010":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536 raise CompressionError("unsupported compression method")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000537
538 flag = ord(self.__read(1))
539 self.__read(6)
540
541 if flag & 4:
542 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
543 self.read(xlen)
544 if flag & 8:
545 while True:
546 s = self.__read(1)
547 if not s or s == NUL:
548 break
549 if flag & 16:
550 while True:
551 s = self.__read(1)
552 if not s or s == NUL:
553 break
554 if flag & 2:
555 self.__read(2)
556
557 def tell(self):
558 """Return the stream's file pointer position.
559 """
560 return self.pos
561
562 def seek(self, pos=0):
563 """Set the stream's file pointer to pos. Negative seeking
564 is forbidden.
565 """
566 if pos - self.pos >= 0:
567 blocks, remainder = divmod(pos - self.pos, self.bufsize)
Guido van Rossum805365e2007-05-07 22:24:25 +0000568 for i in range(blocks):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000569 self.read(self.bufsize)
570 self.read(remainder)
571 else:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572 raise StreamError("seeking backwards is not allowed")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000573 return self.pos
574
575 def read(self, size=None):
576 """Return the next size number of bytes from the stream.
577 If size is not defined, return all bytes of the stream
578 up to EOF.
579 """
580 if size is None:
581 t = []
582 while True:
583 buf = self._read(self.bufsize)
584 if not buf:
585 break
586 t.append(buf)
587 buf = "".join(t)
588 else:
589 buf = self._read(size)
590 self.pos += len(buf)
591 return buf
592
593 def _read(self, size):
594 """Return size bytes from the stream.
595 """
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000596 if self.comptype == "tar":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000597 return self.__read(size)
598
599 c = len(self.dbuf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000600 while c < size:
601 buf = self.__read(self.bufsize)
602 if not buf:
603 break
Guido van Rossumd8faa362007-04-27 19:54:29 +0000604 try:
605 buf = self.cmp.decompress(buf)
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100606 except self.exception:
Guido van Rossumd8faa362007-04-27 19:54:29 +0000607 raise ReadError("invalid compressed data")
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000608 self.dbuf += buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000609 c += len(buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000610 buf = self.dbuf[:size]
611 self.dbuf = self.dbuf[size:]
612 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000613
614 def __read(self, size):
615 """Return size bytes from stream. If internal buffer is empty,
616 read another block from the stream.
617 """
618 c = len(self.buf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000619 while c < size:
620 buf = self.fileobj.read(self.bufsize)
621 if not buf:
622 break
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000623 self.buf += buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000624 c += len(buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000625 buf = self.buf[:size]
626 self.buf = self.buf[size:]
627 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000628# class _Stream
629
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000630class _StreamProxy(object):
631 """Small proxy class that enables transparent compression
632 detection for the Stream interface (mode 'r|*').
633 """
634
635 def __init__(self, fileobj):
636 self.fileobj = fileobj
637 self.buf = self.fileobj.read(BLOCKSIZE)
638
639 def read(self, size):
640 self.read = self.fileobj.read
641 return self.buf
642
643 def getcomptype(self):
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100644 if self.buf.startswith(b"\x1f\x8b\x08"):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000645 return "gz"
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100646 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000647 return "bz2"
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +0100648 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
649 return "xz"
650 else:
651 return "tar"
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000652
653 def close(self):
654 self.fileobj.close()
655# class StreamProxy
656
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000657#------------------------
658# Extraction file object
659#------------------------
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000660class _FileInFile(object):
661 """A thin wrapper around an existing file object that
662 provides a part of its data as an individual file
663 object.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000664 """
665
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000666 def __init__(self, fileobj, offset, size, blockinfo=None):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000667 self.fileobj = fileobj
668 self.offset = offset
669 self.size = size
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000670 self.position = 0
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000671
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000672 if blockinfo is None:
673 blockinfo = [(0, size)]
674
675 # Construct a map with data and zero blocks.
676 self.map_index = 0
677 self.map = []
678 lastpos = 0
679 realpos = self.offset
680 for offset, size in blockinfo:
681 if offset > lastpos:
682 self.map.append((False, lastpos, offset, None))
683 self.map.append((True, offset, offset + size, realpos))
684 realpos += size
685 lastpos = offset + size
686 if lastpos < self.size:
687 self.map.append((False, lastpos, self.size, None))
688
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000689 def seekable(self):
690 if not hasattr(self.fileobj, "seekable"):
691 # XXX gzip.GzipFile and bz2.BZ2File
692 return True
693 return self.fileobj.seekable()
694
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000695 def tell(self):
696 """Return the current file position.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000697 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000698 return self.position
699
700 def seek(self, position):
701 """Seek to a position in the file.
702 """
703 self.position = position
704
705 def read(self, size=None):
706 """Read data from the file.
707 """
708 if size is None:
709 size = self.size - self.position
710 else:
711 size = min(size, self.size - self.position)
712
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000713 buf = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000714 while size > 0:
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000715 while True:
716 data, start, stop, offset = self.map[self.map_index]
717 if start <= self.position < stop:
718 break
719 else:
720 self.map_index += 1
721 if self.map_index == len(self.map):
722 self.map_index = 0
723 length = min(size, stop - self.position)
724 if data:
Lars Gustäbeldd071042011-02-23 11:42:22 +0000725 self.fileobj.seek(offset + (self.position - start))
726 buf += self.fileobj.read(length)
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000727 else:
728 buf += NUL * length
729 size -= length
730 self.position += length
731 return buf
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000732#class _FileInFile
733
734
class ExFileObject(object):
    """File-like object for reading an archive member.
       Is returned by TarFile.extractfile().

       Data is pulled from a _FileInFile wrapper around the archive's
       file object. readline() reads ahead in chunks of `blocksize`
       bytes and parks the surplus in self.buffer; read() always drains
       that buffer before touching the underlying file, so
       self.position is the logical position seen by the caller, not
       the position of the underlying file object.
    """
    # Number of bytes readline() requests per look-ahead read.
    blocksize = 1024

    def __init__(self, tarfile, tarinfo):
        """Bind the object to the data area of `tarinfo' inside
           `tarfile'.
        """
        self.fileobj = _FileInFile(tarfile.fileobj,
                                   tarinfo.offset_data,
                                   tarinfo.size,
                                   tarinfo.sparse)
        self.name = tarinfo.name
        self.mode = "r"
        self.closed = False
        self.size = tarinfo.size

        self.position = 0       # logical read position
        self.buffer = b""       # look-ahead data left over by readline()

    def readable(self):
        """Always True, the object is read-only."""
        return True

    def writable(self):
        """Always False, the object is read-only."""
        return False

    def seekable(self):
        """Report whether the wrapped file object is seekable."""
        return self.fileobj.seekable()

    def read(self, size=None):
        """Read at most size bytes from the file. If size is not
           present, None or negative, read all data until EOF is
           reached.

           Raises ValueError if the file has been closed.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        # Follow the usual file protocol: a negative size means
        # "everything". Without this, buffer[:size] would silently
        # drop bytes from the end of the look-ahead buffer.
        if size is not None and size < 0:
            size = None

        if size is None:
            buf = self.buffer + self.fileobj.read()
            self.buffer = b""
        else:
            # Serve the request from the look-ahead buffer first and
            # fetch only the missing part from the file.
            buf = self.buffer[:size]
            self.buffer = self.buffer[size:]
            buf += self.fileobj.read(size - len(buf))

        self.position += len(buf)
        return buf

    # XXX TextIOWrapper uses the read1() method.
    read1 = read

    def readline(self, size=-1):
        """Read one entire line from the file. If size is present
           and non-negative, return a string with at most that
           size, which may be an incomplete line. None or any
           negative size means no limit.

           Raises ValueError if the file has been closed.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        pos = self.buffer.find(b"\n") + 1
        if pos == 0:
            # no newline found in the buffer, read ahead until one
            # shows up or EOF is reached.
            while True:
                buf = self.fileobj.read(self.blocksize)
                self.buffer += buf
                if not buf or b"\n" in buf:
                    pos = self.buffer.find(b"\n") + 1
                    if pos == 0:
                        # no newline found, hand out the rest.
                        pos = len(self.buffer)
                    break

        # Only a non-negative size limits the line length.
        if size is not None and size >= 0:
            pos = min(size, pos)

        buf = self.buffer[:pos]
        self.buffer = self.buffer[pos:]
        self.position += len(buf)
        return buf

    def readlines(self):
        """Return a list with all remaining lines.
        """
        return list(self)

    def tell(self):
        """Return the current file position.

           Raises ValueError if the file has been closed.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        return self.position

    def seek(self, pos, whence=io.SEEK_SET):
        """Seek to a position in the file and return the new
           absolute position. The target is clamped to the range
           0..self.size.

           Raises ValueError if the file has been closed or whence
           is not one of io.SEEK_SET, io.SEEK_CUR, io.SEEK_END.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        if whence == io.SEEK_SET:
            self.position = min(max(pos, 0), self.size)
        elif whence == io.SEEK_CUR:
            if pos < 0:
                self.position = max(self.position + pos, 0)
            else:
                self.position = min(self.position + pos, self.size)
        elif whence == io.SEEK_END:
            self.position = max(min(self.size + pos, self.size), 0)
        else:
            raise ValueError("Invalid argument")

        # Any look-ahead data belongs to the old position.
        self.buffer = b""
        self.fileobj.seek(self.position)
        return self.position

    def close(self):
        """Close the file object.
        """
        self.closed = True

    def __iter__(self):
        """Get an iterator over the file's lines.
        """
        while True:
            line = self.readline()
            if not line:
                break
            yield line
#class ExFileObject
872
873#------------------
874# Exported Classes
875#------------------
876class TarInfo(object):
877 """Informational class which holds the details about an
878 archive member given by a tar header block.
879 TarInfo objects are returned by TarFile.getmember(),
880 TarFile.getmembers() and TarFile.gettarinfo() and are
881 usually created internally.
882 """
883
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000884 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
885 "chksum", "type", "linkname", "uname", "gname",
886 "devmajor", "devminor",
887 "offset", "offset_data", "pax_headers", "sparse",
888 "tarfile", "_sparse_structs", "_link_target")
889
def __init__(self, name=""):
    """Construct a TarInfo object. name is the optional name
       of the member. Every other attribute starts out with a
       neutral default.
    """
    # Textual header fields.
    self.name = name                # member name
    self.linkname = ""              # link name
    self.uname = ""                 # user name
    self.gname = ""                 # group name

    # Numeric header fields.
    self.mode = 0o644               # file permissions
    self.uid = self.gid = 0         # user id / group id
    self.size = 0                   # file size
    self.mtime = 0                  # modification time
    self.chksum = 0                 # header checksum
    self.devmajor = 0               # device major number
    self.devminor = 0               # device minor number

    self.type = REGTYPE             # member type

    # Positions inside the archive.
    self.offset = 0                 # the tar header starts here
    self.offset_data = 0            # the file's data starts here

    # Extension data.
    self.sparse = None              # sparse member information
    self.pax_headers = {}           # pax header information
913
914 # In pax headers the "name" and "linkname" field are called
915 # "path" and "linkpath".
def _getpath(self):
    """Return the member name (pax keyword "path")."""
    return self.name

def _setpath(self, name):
    """Store `name' as the member name (pax keyword "path")."""
    self.name = name

# "path" is an alias for the "name" attribute.
path = property(fget=_getpath, fset=_setpath)
921
def _getlinkpath(self):
    """Return the link target (pax keyword "linkpath")."""
    return self.linkname

def _setlinkpath(self, linkname):
    """Store `linkname' as the link target (pax keyword "linkpath")."""
    self.linkname = linkname

# "linkpath" is an alias for the "linkname" attribute.
linkpath = property(fget=_getlinkpath, fset=_setlinkpath)
927
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000928 def __repr__(self):
929 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
930
def get_info(self):
    """Return the TarInfo's attributes as a dictionary.

       The mode is masked with 0o7777 and the name of a directory
       member is given a trailing slash if it has none.
    """
    field_names = ("name", "mode", "uid", "gid", "size", "mtime",
                   "chksum", "type", "linkname", "uname", "gname",
                   "devmajor", "devminor")
    info = {field: getattr(self, field) for field in field_names}

    # Keep the permission bits only.
    info["mode"] = info["mode"] & 0o7777

    is_directory = info["type"] == DIRTYPE
    if is_directory and not info["name"].endswith("/"):
        info["name"] += "/"

    return info
954
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
    """Return a tar header as a string of 512 byte blocks.

       format selects the header flavour (USTAR_FORMAT, GNU_FORMAT
       or PAX_FORMAT); any other value raises ValueError. The
       errors argument is not passed on for PAX_FORMAT.
    """
    info = self.get_info()

    # One guard clause per supported format.
    if format == USTAR_FORMAT:
        return self.create_ustar_header(info, encoding, errors)
    if format == GNU_FORMAT:
        return self.create_gnu_header(info, encoding, errors)
    if format == PAX_FORMAT:
        return self.create_pax_header(info, encoding)

    raise ValueError("invalid format")
968
def create_ustar_header(self, info, encoding, errors):
    """Return the object as a ustar header block.

       Raises ValueError if the link name exceeds LENGTH_LINK or
       if a long member name cannot be split into prefix and name.
    """
    info["magic"] = POSIX_MAGIC

    linkname = info["linkname"]
    if len(linkname) > LENGTH_LINK:
        raise ValueError("linkname is too long")

    member_name = info["name"]
    if len(member_name) > LENGTH_NAME:
        # Move the leading directories into the prefix field.
        prefix, shortened = self._posix_split_name(member_name)
        info["prefix"] = prefix
        info["name"] = shortened

    return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000981
def create_gnu_header(self, info, encoding, errors):
    """Return the object as a GNU header block sequence.

       A link name longer than LENGTH_LINK and a member name longer
       than LENGTH_NAME each get their own long-name header, placed
       (in that order) in front of the regular header block.
    """
    info["magic"] = GNU_MAGIC

    blocks = []
    long_fields = (
        ("linkname", LENGTH_LINK, GNUTYPE_LONGLINK),
        ("name", LENGTH_NAME, GNUTYPE_LONGNAME),
    )
    for field, limit, long_type in long_fields:
        text = info[field]
        if len(text) > limit:
            blocks.append(
                self._create_gnu_long_header(text, long_type, encoding, errors))

    blocks.append(self._create_header(info, GNU_FORMAT, encoding, errors))
    return b"".join(blocks)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000995
def create_pax_header(self, info, encoding):
    """Return the object as a ustar header block. If it cannot be
       represented this way, prepend a pax extended header sequence
       with supplement information.
    """
    info["magic"] = POSIX_MAGIC
    pax_headers = self.pax_headers.copy()

    # String fields go into the pax header when they do not fit into
    # their ustar field or are not plain ASCII.
    string_fields = (
        ("name", "path", LENGTH_NAME),
        ("linkname", "linkpath", LENGTH_LINK),
        ("uname", "uname", 32),
        ("gname", "gname", 32),
    )
    for name, hname, length in string_fields:
        if hname in pax_headers:
            # The pax header has priority.
            continue

        text = info[name]
        try:
            text.encode("ascii", "strict")
        except UnicodeEncodeError:
            needs_record = True
        else:
            needs_record = len(text) > length

        if needs_record:
            pax_headers[hname] = text

    # Number fields go into the pax header when they exceed the field
    # limit or are floats.
    number_fields = (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12))
    for name, digits in number_fields:
        if name in pax_headers:
            # The pax header has priority. Avoid overflow.
            info[name] = 0
            continue

        val = info[name]
        fits = 0 <= val < 8 ** (digits - 1)
        if not fits or isinstance(val, float):
            pax_headers[name] = str(val)
            info[name] = 0

    # Create a pax extended header if necessary.
    if pax_headers:
        extended = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
    else:
        extended = b""

    return extended + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001044
@classmethod
def create_pax_global_header(cls, pax_headers):
    """Return the object as a pax global header block sequence.

       The records are built by _create_pax_generic_header() with
       type XGLTYPE and "utf-8" as the encoding.
    """
    global_header = cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
    return global_header
Guido van Rossumd8faa362007-04-27 19:54:29 +00001050
def _posix_split_name(self, name):
    """Split a name longer than 100 chars into a prefix
       and a name part.

       The split happens at the last "/" found within the first
       LENGTH_PREFIX + 1 characters. Raises ValueError if there is
       no such slash, the prefix would be empty, or the remaining
       name is still longer than LENGTH_NAME.
    """
    # Index just past the last usable slash, 0 if there is none.
    cut = name.rfind("/", 0, LENGTH_PREFIX + 1) + 1

    prefix = name[:cut][:-1]    # drop the separating slash
    name = name[cut:]

    if not prefix or len(name) > LENGTH_NAME:
        raise ValueError("name is too long")
    return prefix, name
1065
@staticmethod
def _create_header(info, format, encoding, errors):
    """Return a header block. info is a dictionary with file
       information, format must be one of the *_FORMAT constants.

       String fields are encoded with encoding/errors. Missing keys
       in info fall back to empty strings, zeros, REGTYPE and
       POSIX_MAGIC. The result is padded to BLOCKSIZE bytes and
       carries the header checksum.
    """
    parts = [
        stn(info.get("name", ""), 100, encoding, errors),
        itn(info.get("mode", 0) & 0o7777, 8, format),
        itn(info.get("uid", 0), 8, format),
        itn(info.get("gid", 0), 8, format),
        itn(info.get("size", 0), 12, format),
        itn(info.get("mtime", 0), 12, format),
        # Checksum field: 8 bytes at offsets 148-155, filled with
        # blanks while the checksum is calculated. The width is
        # spelled out so that it cannot be lost to whitespace
        # mangling; a shorter placeholder would shift every
        # following field.
        b" " * 8,
        info.get("type", REGTYPE),
        stn(info.get("linkname", ""), 100, encoding, errors),
        info.get("magic", POSIX_MAGIC),
        stn(info.get("uname", ""), 32, encoding, errors),
        stn(info.get("gname", ""), 32, encoding, errors),
        itn(info.get("devmajor", 0), 8, format),
        itn(info.get("devminor", 0), 8, format),
        stn(info.get("prefix", ""), 155, encoding, errors)
    ]

    # Pad the joined fields with NULs up to a full block.
    buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
    chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
    # Overwrite the first 7 bytes of the checksum field (six octal
    # digits plus NUL); the eighth byte stays a blank.
    buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
    return buf
1093
@staticmethod
def _create_payload(payload):
    """Return the string payload filled with zero bytes
       up to the next 512 byte border.
    """
    overhang = len(payload) % BLOCKSIZE
    if overhang > 0:
        # Fill up the last, partially used block.
        payload += (BLOCKSIZE - overhang) * NUL
    return payload
1103
@classmethod
def _create_gnu_long_header(cls, name, type, encoding, errors):
    """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
       for name.

       The sequence is a header block named "././@LongLink"
       followed by the NUL-terminated encoded name, padded to a
       block boundary.
    """
    encoded_name = name.encode(encoding, errors) + NUL

    info = {
        "name": "././@LongLink",
        "type": type,
        "size": len(encoded_name),
        "magic": GNU_MAGIC,
    }

    # create extended header + name blocks.
    header = cls._create_header(info, USTAR_FORMAT, encoding, errors)
    return header + cls._create_payload(encoded_name)
1120
@classmethod
def _create_pax_generic_header(cls, pax_headers, type, encoding):
    """Return a POSIX.1-2008 extended or global header sequence
       that contains a list of keyword, value pairs. The values
       must be strings.
    """
    # Check if one of the fields contains surrogate characters and thereby
    # forces hdrcharset=BINARY, see _proc_pax() for more information.
    binary = False
    for value in pax_headers.values():
        try:
            value.encode("utf-8", "strict")
        except UnicodeEncodeError:
            binary = True
            break

    chunks = []
    if binary:
        # Put the hdrcharset field at the beginning of the header.
        chunks.append(b"21 hdrcharset=BINARY\n")

    for keyword, value in pax_headers.items():
        key_bytes = keyword.encode("utf-8")
        if binary:
            # Try to restore the original byte representation of `value'.
            # Needless to say, that the encoding must match the string.
            value_bytes = value.encode(encoding, "surrogateescape")
        else:
            value_bytes = value.encode("utf-8")

        # The record length counts its own decimal digits, so iterate
        # until the total stops changing.
        fixed = len(key_bytes) + len(value_bytes) + 3   # ' ' + '=' + '\n'
        total = 0
        while True:
            candidate = fixed + len(str(total))
            if candidate == total:
                break
            total = candidate

        chunks.append(bytes(str(total), "ascii") + b" " + key_bytes +
                      b"=" + value_bytes + b"\n")

    records = b"".join(chunks)

    # We use a hardcoded "././@PaxHeader" name like star does
    # instead of the one that POSIX recommends.
    info = {
        "name": "././@PaxHeader",
        "type": type,
        "size": len(records),
        "magic": POSIX_MAGIC,
    }

    # Create pax header + record blocks.
    header = cls._create_header(info, USTAR_FORMAT, "ascii", "replace")
    return header + cls._create_payload(records)
1171
@classmethod
def frombuf(cls, buf, encoding, errors):
    """Construct a TarInfo object from a 512 byte bytes object.

       Raises EmptyHeaderError for an empty buffer,
       TruncatedHeaderError for a buffer that is not BLOCKSIZE
       bytes long, EOFHeaderError for a block of NULs only and
       InvalidHeaderError if the stored checksum does not match.
    """
    buflen = len(buf)
    if buflen == 0:
        raise EmptyHeaderError("empty header")
    if buflen != BLOCKSIZE:
        raise TruncatedHeaderError("truncated header")
    if buf.count(NUL) == BLOCKSIZE:
        raise EOFHeaderError("end of file header")

    chksum = nti(buf[148:156])
    if chksum not in calc_chksums(buf):
        raise InvalidHeaderError("bad checksum")

    def text(start, stop):
        # Decode the string field buf[start:stop].
        return nts(buf[start:stop], encoding, errors)

    def number(start, stop):
        # Decode the number field buf[start:stop].
        return nti(buf[start:stop])

    obj = cls()
    obj.name = text(0, 100)
    obj.mode = number(100, 108)
    obj.uid = number(108, 116)
    obj.gid = number(116, 124)
    obj.size = number(124, 136)
    obj.mtime = number(136, 148)
    obj.chksum = chksum
    obj.type = buf[156:157]
    obj.linkname = text(157, 257)
    obj.uname = text(265, 297)
    obj.gname = text(297, 329)
    obj.devmajor = number(329, 337)
    obj.devminor = number(337, 345)
    prefix = text(345, 500)

    # Old V7 tar format represents a directory as a regular
    # file with a trailing slash.
    if obj.type == AREGTYPE and obj.name.endswith("/"):
        obj.type = DIRTYPE

    # The old GNU sparse format occupies some of the unused
    # space in the buffer for up to 4 sparse structures.
    # Save them for later processing in _proc_sparse().
    if obj.type == GNUTYPE_SPARSE:
        structs = []
        for pos in range(386, 386 + 4 * 24, 24):
            try:
                offset = number(pos, pos + 12)
                numbytes = number(pos + 12, pos + 24)
            except ValueError:
                break
            structs.append((offset, numbytes))
        isextended = bool(buf[482])
        origsize = number(483, 495)
        obj._sparse_structs = (structs, isextended, origsize)

    # Remove redundant slashes from directories.
    if obj.isdir():
        obj.name = obj.name.rstrip("/")

    # Reconstruct a ustar longname.
    if prefix and obj.type not in GNU_TYPES:
        obj.name = prefix + "/" + obj.name
    return obj
1234
@classmethod
def fromtarfile(cls, tarfile):
    """Return the next TarInfo object from TarFile object
       tarfile.

       One block is read from tarfile.fileobj and parsed with
       frombuf(); the result is handed to _proc_member().
    """
    fileobj = tarfile.fileobj
    header = fileobj.read(BLOCKSIZE)
    member = cls.frombuf(header, tarfile.encoding, tarfile.errors)
    # The header started one block before the current position.
    member.offset = fileobj.tell() - BLOCKSIZE
    return member._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001244
Guido van Rossumd8faa362007-04-27 19:54:29 +00001245 #--------------------------------------------------------------------------
1246 # The following are methods that are called depending on the type of a
1247 # member. The entry point is _proc_member() which can be overridden in a
1248 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1249 # implement the following
1250 # operations:
1251 # 1. Set self.offset_data to the position where the data blocks begin,
1252 # if there is data that follows.
1253 # 2. Set tarfile.offset to the position where the next member's header will
1254 # begin.
1255 # 3. Return self or another valid TarInfo object.
def _proc_member(self, tarfile):
    """Choose the right processing method depending on
       the type and call it.
    """
    member_type = self.type

    if member_type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
        handler = self._proc_gnulong
    elif member_type == GNUTYPE_SPARSE:
        handler = self._proc_sparse
    elif member_type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
        handler = self._proc_pax
    else:
        # Everything else is handled like a builtin type.
        handler = self._proc_builtin

    return handler(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001268
def _proc_builtin(self, tarfile):
    """Process a builtin type or an unknown type which
       will be treated as a regular file.
    """
    data_start = tarfile.fileobj.tell()
    self.offset_data = data_start

    next_header = data_start
    carries_data = self.isreg() or self.type not in SUPPORTED_TYPES
    if carries_data:
        # Skip the following data blocks.
        next_header += self._block(self.size)
    tarfile.offset = next_header

    # Patch the TarInfo object with saved global
    # header information.
    self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

    return self
1285
def _proc_gnulong(self, tarfile):
    """Process the blocks that hold a GNU longname
       or longlink member.

       Raises SubsequentHeaderError if the header that must follow
       the long name data is missing or invalid.
    """
    long_data = tarfile.fileobj.read(self._block(self.size))

    # Fetch the next header and process it.
    try:
        member = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Patch the TarInfo object from the next header with
    # the longname information.
    member.offset = self.offset
    if self.type == GNUTYPE_LONGNAME:
        member.name = nts(long_data, tarfile.encoding, tarfile.errors)
    elif self.type == GNUTYPE_LONGLINK:
        member.linkname = nts(long_data, tarfile.encoding, tarfile.errors)

    return member
1307
def _proc_sparse(self, tarfile):
    """Process a GNU sparse header plus extra headers.
    """
    # We already collected some sparse structures in frombuf().
    structs, isextended, origsize = self._sparse_structs
    del self._sparse_structs

    fileobj = tarfile.fileobj

    # Collect sparse structures from extended header blocks, each of
    # which holds up to 21 (offset, numbytes) pairs of 12 bytes each.
    while isextended:
        block = fileobj.read(BLOCKSIZE)
        for pos in range(0, 21 * 24, 24):
            try:
                offset = nti(block[pos:pos + 12])
                numbytes = nti(block[pos + 12:pos + 24])
            except ValueError:
                break
            if offset and numbytes:
                structs.append((offset, numbytes))
        isextended = bool(block[504])
    self.sparse = structs

    self.offset_data = fileobj.tell()
    tarfile.offset = self.offset_data + self._block(self.size)
    # Only now replace the stored size with the original file size.
    self.size = origsize
    return self
1335
def _proc_pax(self, tarfile):
    """Process an extended or global header as described in
       POSIX.1-2008.

       The records of the header are parsed into a dictionary, the
       header of the following member is fetched and, for an
       extended header, patched with the parsed information. That
       following TarInfo object is returned.

       Raises InvalidHeaderError if a record announces a length of
       zero and SubsequentHeaderError if the following header is
       missing or invalid.
    """
    # Read the header information.
    buf = tarfile.fileobj.read(self._block(self.size))

    # A pax header stores supplemental information for either
    # the following file (extended) or all following files
    # (global).
    if self.type == XGLTYPE:
        pax_headers = tarfile.pax_headers
    else:
        pax_headers = tarfile.pax_headers.copy()

    # Check if the pax header contains a hdrcharset field. This tells us
    # the encoding of the path, linkpath, uname and gname fields. Normally,
    # these fields are UTF-8 encoded but since POSIX.1-2008 tar
    # implementations are allowed to store them as raw binary strings if
    # the translation to UTF-8 fails.
    match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
    if match is not None:
        pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

    # For the time being, we don't care about anything other than "BINARY".
    # The only other value that is currently allowed by the standard is
    # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
    hdrcharset = pax_headers.get("hdrcharset")
    if hdrcharset == "BINARY":
        encoding = tarfile.encoding
    else:
        encoding = "utf-8"

    # Parse pax header information. A record looks like that:
    # "%d %s=%s\n" % (length, keyword, value). length is the size
    # of the complete record including the length field itself and
    # the newline. keyword and value are both UTF-8 encoded strings.
    regex = re.compile(br"(\d+) ([^=]+)=")
    pos = 0
    while True:
        match = regex.match(buf, pos)
        if not match:
            break

        length, keyword = match.groups()
        length = int(length)
        if length == 0:
            # A zero length would never advance pos, so a crafted
            # archive could keep this loop spinning forever.
            raise InvalidHeaderError("invalid header")
        value = buf[match.end(2) + 1:match.start(1) + length - 1]

        # Normally, we could just use "utf-8" as the encoding and "strict"
        # as the error handler, but we better not take the risk. For
        # example, GNU tar <= 1.23 is known to store filenames it cannot
        # translate to UTF-8 as raw strings (unfortunately without a
        # hdrcharset=BINARY header).
        # We first try the strict standard encoding, and if that fails we
        # fall back on the user's encoding and error handler.
        keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                                         tarfile.errors)
        if keyword in PAX_NAME_FIELDS:
            value = self._decode_pax_field(value, encoding, tarfile.encoding,
                                           tarfile.errors)
        else:
            value = self._decode_pax_field(value, "utf-8", "utf-8",
                                           tarfile.errors)

        pax_headers[keyword] = value
        pos += length

    # Fetch the next header.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Process GNU sparse information.
    if "GNU.sparse.map" in pax_headers:
        # GNU extended sparse format version 0.1.
        self._proc_gnusparse_01(next, pax_headers)

    elif "GNU.sparse.size" in pax_headers:
        # GNU extended sparse format version 0.0.
        self._proc_gnusparse_00(next, pax_headers, buf)

    elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
        # GNU extended sparse format version 1.0.
        self._proc_gnusparse_10(next, pax_headers, tarfile)

    if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
        # Patch the TarInfo object with the extended header info.
        next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
        next.offset = self.offset

        if "size" in pax_headers:
            # If the extended header replaces the size field,
            # we need to recalculate the offset where the next
            # header starts.
            offset = next.offset_data
            if next.isreg() or next.type not in SUPPORTED_TYPES:
                offset += next._block(next.size)
            tarfile.offset = offset

    return next
1437
def _proc_gnusparse_00(self, next, pax_headers, buf):
    """Process a GNU tar extended sparse header, version 0.0.

       The raw pax record buffer `buf' is scanned for
       "GNU.sparse.offset" and "GNU.sparse.numbytes" records. The values
       are paired up in order of appearance and stored on `next' as a
       list of (offset, numbytes) tuples. `pax_headers' is not consulted.
    """
    starts = [int(found.group(1)) for found in
              re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf)]
    lengths = [int(found.group(1)) for found in
               re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf)]
    next.sparse = list(zip(starts, lengths))
1448
def _proc_gnusparse_01(self, next, pax_headers):
    """Process a GNU tar extended sparse header, version 0.1.

       The "GNU.sparse.map" pax header holds comma-separated integers
       that alternate between offset and size; they are stored on `next'
       as a list of (offset, size) tuples.
    """
    numbers = iter([int(item)
                    for item in pax_headers["GNU.sparse.map"].split(",")])
    # Zipping one iterator with itself consumes the values two at a time;
    # a trailing unpaired value is dropped.
    next.sparse = list(zip(numbers, numbers))
1454
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

       The sparse map is read from `tarfile.fileobj' as newline-terminated
       decimal numbers: first the number of map entries, then an offset
       and a size for each entry. Data is pulled in BLOCKSIZE chunks.
       Afterwards `next.offset_data' is set to the current position of the
       file object and `next.sparse' to the list of (offset, size) tuples.
    """
    stream = tarfile.fileobj
    pending = stream.read(BLOCKSIZE)
    head, pending = pending.split(b"\n", 1)
    wanted = int(head) * 2      # one offset and one size per entry
    numbers = []
    while len(numbers) < wanted:
        # Fetch another block when no complete line is buffered.
        if b"\n" not in pending:
            pending += stream.read(BLOCKSIZE)
        token, pending = pending.split(b"\n", 1)
        numbers.append(int(token))
    next.offset_data = stream.tell()
    next.sparse = list(zip(numbers[::2], numbers[1::2]))
1470
def _apply_pax_info(self, pax_headers, encoding, errors):
    """Replace fields with supplemental information from a previous
       pax extended or global header.

       "GNU.sparse.name" is stored as `path'; "GNU.sparse.size" and
       "GNU.sparse.realsize" are stored as integer `size'. Keywords found
       in PAX_FIELDS are set as attributes of the same name, after being
       converted with PAX_NUMBER_FIELDS where applicable (0 if the
       conversion raises ValueError). All other keywords are only kept in
       the copy of `pax_headers' stored on the object. `encoding' and
       `errors' are accepted but not used here.
    """
    for keyword, value in pax_headers.items():
        if keyword == "GNU.sparse.name":
            self.path = value
            continue
        if keyword in ("GNU.sparse.size", "GNU.sparse.realsize"):
            self.size = int(value)
            continue
        if keyword not in PAX_FIELDS:
            continue

        if keyword in PAX_NUMBER_FIELDS:
            convert = PAX_NUMBER_FIELDS[keyword]
            try:
                value = convert(value)
            except ValueError:
                value = 0
        if keyword == "path":
            value = value.rstrip("/")
        setattr(self, keyword, value)

    self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001493
def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
    """Decode a single field from a pax record.

       `value' is decoded strictly with `encoding' first. If that raises
       UnicodeDecodeError it is decoded with `fallback_encoding' and the
       `fallback_errors' error handler instead.
    """
    try:
        text = value.decode(encoding, "strict")
    except UnicodeDecodeError:
        text = value.decode(fallback_encoding, fallback_errors)
    return text
1501
def _block(self, count):
    """Round up a byte count by BLOCKSIZE and return it,
       e.g. _block(834) => 1024.
    """
    whole = count // BLOCKSIZE
    # A partially filled block still occupies a complete one.
    if count % BLOCKSIZE:
        whole += 1
    return whole * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001510
def isreg(self):
    """Return True if the member's type is one of REGULAR_TYPES."""
    kind = self.type
    return kind in REGULAR_TYPES
def isfile(self):
    """Alias for isreg(): return whatever isreg() returns."""
    regular = self.isreg()
    return regular
def isdir(self):
    """Return True if the member's type equals DIRTYPE."""
    kind = self.type
    return kind == DIRTYPE
def issym(self):
    """Return True if the member's type equals SYMTYPE."""
    kind = self.type
    return kind == SYMTYPE
def islnk(self):
    """Return True if the member's type equals LNKTYPE."""
    kind = self.type
    return kind == LNKTYPE
def ischr(self):
    """Return True if the member's type equals CHRTYPE."""
    kind = self.type
    return kind == CHRTYPE
def isblk(self):
    """Return True if the member's type equals BLKTYPE."""
    kind = self.type
    return kind == BLKTYPE
def isfifo(self):
    """Return True if the member's type equals FIFOTYPE."""
    kind = self.type
    return kind == FIFOTYPE
def issparse(self):
    """Return True if a sparse map has been set, i.e. the `sparse'
       attribute is anything other than None.
    """
    return not (self.sparse is None)
def isdev(self):
    """Return True if the member's type is CHRTYPE, BLKTYPE or FIFOTYPE."""
    kind = self.type
    return kind in (CHRTYPE, BLKTYPE, FIFOTYPE)
1531# class TarInfo
1532
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.

       The class attributes below are defaults. __init__() replaces them
       on the instance for every corresponding argument that is not None.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.
                                # __init__() always overwrites this with
                                # its `errors' argument.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The default ExFileObject class to use.
1558
def __init__(self, name=None, mode="r", fileobj=None, format=None,
        tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
        errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
    """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
       read from an existing archive, 'a' to append data to an existing
       file or 'w' to create a new file overwriting an existing one. `mode'
       defaults to 'r'.
       If `fileobj' is given, it is used for reading or writing data. If it
       can be determined, `mode' is overridden by `fileobj's mode.
       `fileobj' is not closed, when TarFile is closed.

       The remaining arguments override the class-level defaults of the
       same name when they are not None; `errors' is always stored.
       `pax_headers' is only kept if the format is PAX_FORMAT.

       Raises ValueError if `mode' is not exactly 'r', 'a' or 'w'. In
       append mode a HeaderError other than EOFHeaderError is re-raised
       as ReadError. If anything fails after the file object has been set
       up, a file object opened here is closed before the exception
       propagates.
    """
    # Look the mode up in the table instead of testing for a substring of
    # "raw": the empty string is a substring of everything and used to
    # pass the check only to fail with a KeyError one line later.
    modes = {"r": "rb", "a": "r+b", "w": "wb"}
    if mode not in modes:
        raise ValueError("mode must be 'r', 'a' or 'w'")
    self.mode = mode
    self._mode = modes[mode]

    if not fileobj:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        self._extfileobj = False
    else:
        if name is None and hasattr(fileobj, "name"):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Init attributes.
    if format is not None:
        self.format = format
    if tarinfo is not None:
        self.tarinfo = tarinfo
    if dereference is not None:
        self.dereference = dereference
    if ignore_zeros is not None:
        self.ignore_zeros = ignore_zeros
    if encoding is not None:
        self.encoding = encoding
    self.errors = errors

    if pax_headers is not None and self.format == PAX_FORMAT:
        self.pax_headers = pax_headers
    else:
        self.pax_headers = {}

    if debug is not None:
        self.debug = debug
    if errorlevel is not None:
        self.errorlevel = errorlevel

    # Init datastructures.
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    tarinfo = self.tarinfo.fromtarfile(self)
                    self.members.append(tarinfo)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e))

        if self.mode in ("a", "w"):
            self._loaded = True

            if self.pax_headers:
                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    except:
        # Deliberately catch everything: the file object opened above
        # must not leak, and the exception is re-raised unchanged.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001654
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001655 #--------------------------------------------------------------------------
1656 # Below are the classmethods which act as alternate constructors to the
1657 # TarFile class. The open() method is the only one that is needed for
1658 # public use; it is the "super"-constructor and is able to select an
1659 # adequate "sub"-constructor for a particular compression using the mapping
1660 # from OPEN_METH.
1661 #
1662 # This concept allows one to subclass TarFile without losing the comfort of
1663 # the super-constructor. A sub-constructor is registered and made available
1664 # by adding it to the mapping in OPEN_METH.
1665
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
       an appropriate TarFile class.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'r:xz'       open for reading with lzma compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression
       'w:xz'       open for writing with lzma compression

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'r|xz'       open an lzma compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing
       'w|xz'       open an lzma compressed stream for writing

       Raises ValueError if neither `name' nor `fileobj' is given or if
       `mode' cannot be interpreted, CompressionError for an unknown
       compression type after ':' and ReadError if none of the registered
       open methods can read the archive.
    """

    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Find out which *open() is appropriate for opening the file.
        for comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
            if fileobj is not None:
                saved_pos = fileobj.tell()
            try:
                return func(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Rewind so that the next candidate sees the same data.
                if fileobj is not None:
                    fileobj.seek(saved_pos)
                continue
        raise ReadError("file could not be opened successfully")

    elif ":" in mode:
        filemode, comptype = mode.split(":", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
        else:
            raise CompressionError("unknown compression type %r" % comptype)
        return func(name, filemode, fileobj, **kwargs)

    elif "|" in mode:
        filemode, comptype = mode.split("|", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Compare whole strings: a substring test against "rw" would
        # let the two-character mode "rw" through.
        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            t = cls(name, filemode, stream, **kwargs)
        except:
            stream.close()
            raise
        t._extfileobj = False
        return t

    # Whole-string comparison again, so that "" and "aw" end up in the
    # "undiscernible mode" error below instead of reaching taropen().
    elif mode in ("a", "w"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001745
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.

       `mode' must be exactly 'r', 'a' or 'w', anything else raises
       ValueError. The arguments are handed on to the class constructor
       and the new instance is returned.
    """
    # Compare against whole strings: the empty string is a substring of
    # "raw" and must not be accepted as a mode.
    if mode not in ("r", "a", "w"):
        raise ValueError("mode must be 'r', 'a' or 'w'")
    return cls(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001753
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError for a mode other than 'r' or 'w' and
       CompressionError if the gzip module is not usable. When reading,
       an IOError from the gzip layer is reported as ReadError so that
       open() can try the next format; when writing it is passed through
       unchanged.

       The GzipFile wrapper created here always belongs to the returned
       TarFile and is closed together with it, so that the gzip trailer
       gets written. GzipFile.close() leaves a caller-supplied `fileobj'
       open.
    """
    # Whole-string comparison; a substring test would accept "".
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    try:
        fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
    except IOError:
        # Failing to open a file by name is a plain I/O error; only
        # data handed in by the caller can be "not a gzip file".
        if fileobj is not None and mode == "r":
            raise ReadError("not a gzip file")
        raise

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except IOError:
        fileobj.close()
        if mode == "r":
            raise ReadError("not a gzip file")
        raise
    except:
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1784
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError for a mode other than 'r' or 'w' and
       CompressionError if the bz2 module cannot be imported. When
       reading, IOError and EOFError are reported as ReadError; when
       writing they are passed through unchanged. The BZ2File created
       here is closed whenever opening the archive fails.
    """
    # Whole-string comparison; a substring test would accept "".
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    fileobj = bz2.BZ2File(filename=name if fileobj is None else None,
            mode=mode, fileobj=fileobj, compresslevel=compresslevel)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (IOError, EOFError):
        fileobj.close()
        if mode == "r":
            raise ReadError("not a bzip2 file")
        raise
    except:
        # E.g. ReadError for a valid bzip2 stream that is not a tar
        # archive: do not leave the wrapper open.
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1808
@classmethod
def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
    """Open lzma compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError for a mode other than 'r' or 'w' and
       CompressionError if the lzma module cannot be imported. When
       reading, LZMAError and EOFError are reported as ReadError; when
       writing they are passed through unchanged. The LZMAFile created
       here is closed whenever opening the archive fails.
    """
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import lzma
    except ImportError:
        raise CompressionError("lzma module is not available")

    fileobj = lzma.LZMAFile(filename=name if fileobj is None else None,
            mode=mode, fileobj=fileobj, preset=preset)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (lzma.LZMAError, EOFError):
        fileobj.close()
        if mode == "r":
            raise ReadError("not an lzma file")
        raise
    except:
        # E.g. ReadError for a valid xz stream that is not a tar
        # archive: do not leave the wrapper open.
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1832
# All *open() methods are registered here. The keys are the compression
# type names used in mode strings, the values are the names of the
# classmethods that open() looks up with getattr().
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz":  "gzopen",    # gzip compressed tar
    "bz2": "bz2open",   # bzip2 compressed tar
    "xz":  "xzopen"     # lzma compressed tar
}
1840
1841 #--------------------------------------------------------------------------
1842 # The public methods which TarFile provides:
1843
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive.

       Calling close() on an already closed TarFile does nothing. The
       TarFile is marked closed, and a file object it owns is closed,
       even if writing the final blocks raises; the exception still
       propagates to the caller.
    """
    if self.closed:
        return

    self.closed = True
    try:
        if self.mode in ("a", "w"):
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
    finally:
        # Release the file even when the writes above fail, e.g.
        # because the disk is full.
        if not self._extfileobj:
            self.fileobj.close()
1862 self.closed = True
1863
def getmember(self, name):
    """Return a TarInfo object for member `name'. If `name' can not be
       found in the archive, KeyError is raised. If a member occurs more
       than once in the archive, its last occurrence is assumed to be the
       most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001874
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects. The
       list has the same order as the members in the archive.
    """
    self._check()
    # A complete member list requires that the whole archive has been
    # scanned once.
    if self._loaded:
        return self.members
    self._load()
    return self.members
1884
def getnames(self):
    """Return the members of the archive as a list of their names. It has
       the same order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001890
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object for either the file `name' or the file
       object `fileobj' (using os.fstat on its file descriptor). You can
       modify some of the TarInfo's attributes before you add it using
       addfile(). If given, `arcname' specifies an alternative name for the
       file in the archive.

       Returns None if the file is of a type that is not handled here.
    """
    self._check("aw")

    # A file object overrides `name' with its own name.
    if fileobj is not None:
        name = fileobj.name

    # Build the member name: drop a drive prefix, use forward slashes
    # only and turn absolute paths into relative ones.
    if arcname is None:
        arcname = name
    arcname = os.path.splitdrive(arcname)[1]
    arcname = arcname.replace(os.sep, "/").lstrip("/")

    info = self.tarinfo()
    info.tarfile = self

    # Stat the open descriptor if there is one. Otherwise stat the path,
    # without following symlinks unless dereferencing was requested or
    # the platform has no lstat().
    if fileobj is not None:
        st = os.fstat(fileobj.fileno())
    elif hasattr(os, "lstat") and not self.dereference:
        st = os.lstat(name)
    else:
        st = os.stat(name)

    target = ""
    perms = st.st_mode
    if stat.S_ISREG(perms):
        key = (st.st_ino, st.st_dev)
        archived_under_other_name = (
            not self.dereference and st.st_nlink > 1
            and key in self.inodes and arcname != self.inodes[key])
        if archived_under_other_name:
            # A hardlink to an already archived file.
            kind = LNKTYPE
            target = self.inodes[key]
        else:
            # The inode is remembered only if it is valid.
            # For win32 it is always 0.
            kind = REGTYPE
            if key[0]:
                self.inodes[key] = arcname
    elif stat.S_ISDIR(perms):
        kind = DIRTYPE
    elif stat.S_ISFIFO(perms):
        kind = FIFOTYPE
    elif stat.S_ISLNK(perms):
        kind = SYMTYPE
        target = os.readlink(name)
    elif stat.S_ISCHR(perms):
        kind = CHRTYPE
    elif stat.S_ISBLK(perms):
        kind = BLKTYPE
    else:
        return None

    # Copy everything the stat result offers into the TarInfo object.
    info.name = arcname
    info.mode = perms
    info.uid = st.st_uid
    info.gid = st.st_gid
    # Only regular files carry data in the archive.
    info.size = st.st_size if kind == REGTYPE else 0
    info.mtime = st.st_mtime
    info.type = kind
    info.linkname = target

    # User and group names are optional: unknown ids are left alone.
    if pwd:
        try:
            info.uname = pwd.getpwuid(info.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            info.gname = grp.getgrgid(info.gid)[0]
        except KeyError:
            pass

    if (kind in (CHRTYPE, BLKTYPE)
            and hasattr(os, "major") and hasattr(os, "minor")):
        info.devmajor = os.major(st.st_rdev)
        info.devminor = os.minor(st.st_rdev)
    return info
1988
def list(self, verbose=True):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced.
    """
    self._check()

    def emit(*fields):
        # Each column is followed by a single space, never a newline.
        print(*fields, end=' ')

    for member in self:
        if verbose:
            emit(filemode(member.mode))
            emit("%s/%s" % (member.uname or member.uid,
                            member.gname or member.gid))
            if member.ischr() or member.isblk():
                # Devices show "major,minor" in place of a size.
                device = "%d,%d" % (member.devmajor, member.devminor)
                emit("%10s" % device)
            else:
                emit("%10d" % member.size)
            emit("%d-%02d-%02d %02d:%02d:%02d"
                 % time.localtime(member.mtime)[:6])

        shown = member.name
        if member.isdir():
            shown += "/"
        emit(shown)

        if verbose:
            if member.issym():
                emit("->", member.linkname)
            if member.islnk():
                emit("link to", member.linkname)
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002017
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. `exclude' is a function that should
       return True for each filename to be excluded. `filter' is a function
       that expects a TarInfo object argument and returns the changed
       TarInfo object, if it returns None the TarInfo object will be
       excluded from the archive.

       Passing `exclude' emits a DeprecationWarning. The archive itself
       and files of a type gettarinfo() does not handle are skipped.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)

    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        # The with block closes the source file even if addfile() raises.
        with bltn_open(name, "rb") as f:
            self.addfile(tarinfo, f)

    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            for f in os.listdir(name):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                        recursive, exclude, filter=filter)

    else:
        self.addfile(tarinfo)
2079
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, tarinfo.size bytes are read from it and added to the archive.
       You can create TarInfo objects using gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.
    """
    self._check("aw")

    # Work on a shallow copy; the copy is what ends up in self.members.
    member = copy.copy(tarinfo)

    out = self.fileobj
    header = member.tobuf(self.format, self.encoding, self.errors)
    out.write(header)
    self.offset += len(header)

    # If there's data to follow, append it and pad it with NULs up to
    # the next block boundary.
    if fileobj is not None:
        copyfileobj(fileobj, out, member.size)
        full_blocks, tail = divmod(member.size, BLOCKSIZE)
        if tail > 0:
            out.write(NUL * (BLOCKSIZE - tail))
            full_blocks += 1
        self.offset += full_blocks * BLOCKSIZE

    self.members.append(member)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002105
def extractall(self, path=".", members=None):
    """Extract every member of the archive below `path' (default: the
    current working directory). `members' is optional and must be a
    subset of the list returned by getmembers().

    Directories are first created with a safe mode and without
    attributes; their owner, modification time and permissions are
    applied once all members have been extracted.
    """
    pending_dirs = []

    for member in (self if members is None else members):
        if member.isdir():
            # Remember the real entry and extract a copy that carries
            # a safe mode instead.
            pending_dirs.append(member)
            member = copy.copy(member)
            member.mode = 0o700
        # Attributes of directories are applied further down.
        self.extract(member, path, set_attrs=not member.isdir())

    # Process directories in reverse name order.
    pending_dirs.sort(key=lambda entry: entry.name)
    pending_dirs.reverse()

    # Now set the real owner, mtime and mode on the directories.
    for entry in pending_dirs:
        dirpath = os.path.join(path, entry.name)
        try:
            self.chown(entry, dirpath)
            self.utime(entry, dirpath)
            self.chmod(entry, dirpath)
        except ExtractError as err:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % err)
2143
def extract(self, member, path="", set_attrs=True):
    """Extract one member below `path' (default: the current working
    directory) using its full name. `member' may be a filename or a
    TarInfo object. File attributes (owner, mtime, mode) are set
    unless `set_attrs' is False.

    An EnvironmentError is re-raised when errorlevel > 0 and an
    ExtractError when errorlevel > 1; otherwise the error is only
    reported through _dbg().
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # makelink() needs to know where the hard link's target lives.
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    try:
        self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                             set_attrs=set_attrs)
    except EnvironmentError as err:
        if self.errorlevel > 0:
            raise
        if err.filename is None:
            self._dbg(1, "tarfile: %s" % err.strerror)
        else:
            self._dbg(1, "tarfile: %s %r" % (err.strerror, err.filename))
    except ExtractError as err:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % err)
2178
def extractfile(self, member):
    """Return a read-only file-like object for `member', which may be a
    filename or a TarInfo object.

    Regular files and members of an unknown type yield a file object
    for their data. For a hard or symbolic link the file object of
    the link's target is returned. For any other member (directory,
    device, fifo, ...) the result is None.
    The file-like object provides the following methods:
    read(), readline(), readlines(), seek() and tell()
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # A member whose type is unknown is treated like a regular file.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    if tarinfo.islnk() or tarinfo.issym():
        if isinstance(self.fileobj, _Stream):
            # Resolving the link would require seeking, which a
            # non-seekable stream of tar blocks cannot do.
            raise StreamError("cannot extract (sym)link as file object")
        # A (sym)link's file object is its target's file object.
        return self.extractfile(self._find_link_target(tarinfo))

    # No data is associated with this kind of member.
    return None
2216
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002217 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002218 """Extract the TarInfo object tarinfo to a physical
2219 file called targetpath.
2220 """
2221 # Fetch the TarInfo object for the given name
2222 # and build the destination pathname, replacing
2223 # forward slashes to platform specific separators.
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +00002224 targetpath = targetpath.rstrip("/")
2225 targetpath = targetpath.replace("/", os.sep)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002226
2227 # Create all upper directories.
2228 upperdirs = os.path.dirname(targetpath)
2229 if upperdirs and not os.path.exists(upperdirs):
Christian Heimes2202f872008-02-06 14:31:34 +00002230 # Create directories that are not part of the archive with
2231 # default permissions.
Thomas Woutersb2137042007-02-01 18:02:27 +00002232 os.makedirs(upperdirs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002233
2234 if tarinfo.islnk() or tarinfo.issym():
2235 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2236 else:
2237 self._dbg(1, tarinfo.name)
2238
2239 if tarinfo.isreg():
2240 self.makefile(tarinfo, targetpath)
2241 elif tarinfo.isdir():
2242 self.makedir(tarinfo, targetpath)
2243 elif tarinfo.isfifo():
2244 self.makefifo(tarinfo, targetpath)
2245 elif tarinfo.ischr() or tarinfo.isblk():
2246 self.makedev(tarinfo, targetpath)
2247 elif tarinfo.islnk() or tarinfo.issym():
2248 self.makelink(tarinfo, targetpath)
2249 elif tarinfo.type not in SUPPORTED_TYPES:
2250 self.makeunknown(tarinfo, targetpath)
2251 else:
2252 self.makefile(tarinfo, targetpath)
2253
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002254 if set_attrs:
2255 self.chown(tarinfo, targetpath)
2256 if not tarinfo.issym():
2257 self.chmod(tarinfo, targetpath)
2258 self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002259
2260 #--------------------------------------------------------------------------
2261 # Below are the different file methods. They are called via
2262 # _extract_member() when extract() is called. They can be replaced in a
2263 # subclass to implement other functionality.
2264
def makedir(self, tarinfo, targetpath):
    """Create the directory `targetpath'; an already existing one is
    silently accepted.
    """
    # The directory is created with a safe mode; the real mode is set
    # later in _extract_member().
    safe_mode = 0o700
    try:
        os.mkdir(targetpath, safe_mode)
    except FileExistsError:
        return
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002274
def makefile(self, tarinfo, targetpath):
    """Write the data of the member `tarinfo' to a new file called
    `targetpath'.

    The archive's file object is positioned at tarinfo.offset_data and
    the member's data is copied from there. For a sparse member
    (tarinfo.sparse is not None) every (offset, size) pair is written
    at its offset in the target; in both cases the target is finally
    truncated to exactly tarinfo.size bytes.
    """
    source = self.fileobj
    source.seek(tarinfo.offset_data)
    # The with statement guarantees the target is closed even when
    # copying fails half-way (e.g. truncated archive, disk full).
    with bltn_open(targetpath, "wb") as target:
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
        else:
            copyfileobj(source, target, tarinfo.size)
        # Set the final file size; this also covers a trailing hole of
        # a sparse file.
        target.seek(tarinfo.size)
        target.truncate()
2290
def makeunknown(self, tarinfo, targetpath):
    """Extract a member of unknown type to `targetpath' as if it were
    a regular file and report that through _dbg().
    """
    self.makefile(tarinfo, targetpath)
    message = ("tarfile: Unknown file type %r, "
               "extracted as regular file.") % tarinfo.type
    self._dbg(1, message)
2298
def makefifo(self, tarinfo, targetpath):
    """Create a fifo called `targetpath'. Raise ExtractError if the
    os module provides no mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002306
def makedev(self, tarinfo, targetpath):
    """Create the character or block device node `targetpath'. Raise
    ExtractError if the os module lacks mknod() or makedev().
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the permission bits with the matching file type flag.
    type_flag = stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR
    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, tarinfo.mode | type_flag, device)
2321
def makelink(self, tarinfo, targetpath):
    """Create the symbolic or hard link `targetpath'.

    A hard link whose target does not exist on disk is replaced by a
    copy of the referenced archive member. The same fallback is used
    when creating the link raises symlink_exception (platform
    limitation); if the referenced member cannot be found then,
    ExtractError is raised.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        elif os.path.exists(tarinfo._link_target):
            # _link_target is prepared by extract().
            os.link(tarinfo._link_target, targetpath)
        else:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
    except symlink_exception:
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002344
def chown(self, tarinfo, targetpath):
    """Set the owner of `targetpath' according to `tarinfo'.

    Nothing is done unless pwd is available and the effective user id
    is 0. Group and user are looked up by name first; the numeric
    gid/uid stored in `tarinfo' are used when the name is unknown.
    Raise ExtractError if changing the owner fails.
    """
    # We have to be root to change ownership.
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        return

    try:
        gid = grp.getgrnam(tarinfo.gname)[2]
    except KeyError:
        gid = tarinfo.gid
    try:
        uid = pwd.getpwnam(tarinfo.uname)[2]
    except KeyError:
        uid = tarinfo.uid

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            # Change the link itself, not the file it points to.
            os.lchown(targetpath, uid, gid)
        elif sys.platform != "os2emx":
            os.chown(targetpath, uid, gid)
    except EnvironmentError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002366
def chmod(self, tarinfo, targetpath):
    """Apply tarinfo.mode to `targetpath'. Do nothing if the os module
    has no chmod(); raise ExtractError if the call fails.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except EnvironmentError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002375
def utime(self, tarinfo, targetpath):
    """Set access and modification time of `targetpath' to
    tarinfo.mtime. Do nothing if the os module has no utime(); raise
    ExtractError if the call fails.
    """
    if hasattr(os, 'utime'):
        stamp = tarinfo.mtime
        try:
            os.utime(targetpath, (stamp, stamp))
        except EnvironmentError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002385
2386 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, when
    TarFile is opened for reading. Return None if there is no more
    available.

    A ReadError is raised for an empty, invalid or truncated header at
    offset 0 and for any SubsequentHeaderError. With ignore_zeros set,
    EOF and invalid header blocks are skipped one block at a time.
    """
    self._check("ra")

    # A member that was already read ahead is handed out first.
    first = self.firstmember
    if first is not None:
        self.firstmember = None
        return first

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as err:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, err))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as err:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, err))
                self.offset += BLOCKSIZE
                continue
            elif self.offset == 0:
                raise ReadError(str(err))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as err:
            if self.offset == 0:
                raise ReadError(str(err))
        except SubsequentHeaderError as err:
            raise ReadError(str(err))
        # Either a member was read or an error ended the archive.
        break

    if tarinfo is None:
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002432
2433 #--------------------------------------------------------------------------
2434 # Little helper methods:
2435
Lars Gustäbel1b512722010-06-03 12:45:16 +00002436 def _getmember(self, name, tarinfo=None, normalize=False):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002437 """Find an archive member by name from bottom to top.
2438 If tarinfo is given, it is used as the starting point.
2439 """
Martin v. Löwisf3c56112004-09-18 09:08:52 +00002440 # Ensure that all members have been loaded.
2441 members = self.getmembers()
2442
Lars Gustäbel1b512722010-06-03 12:45:16 +00002443 # Limit the member search list up to tarinfo.
2444 if tarinfo is not None:
2445 members = members[:members.index(tarinfo)]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002446
Lars Gustäbel1b512722010-06-03 12:45:16 +00002447 if normalize:
2448 name = os.path.normpath(name)
2449
2450 for member in reversed(members):
2451 if normalize:
2452 member_name = os.path.normpath(member.name)
2453 else:
2454 member_name = member.name
2455
2456 if name == member_name:
2457 return member
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002458
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002459 def _load(self):
2460 """Read through the entire archive file and look for readable
2461 members.
2462 """
2463 while True:
2464 tarinfo = self.next()
2465 if tarinfo is None:
2466 break
2467 self._loaded = True
2468
2469 def _check(self, mode=None):
2470 """Check if TarFile is still open, and if the operation's mode
2471 corresponds to TarFile's mode.
2472 """
2473 if self.closed:
Thomas Wouters477c8d52006-05-27 19:21:47 +00002474 raise IOError("%s is closed" % self.__class__.__name__)
Guido van Rossumd8faa362007-04-27 19:54:29 +00002475 if mode is not None and self.mode not in mode:
2476 raise IOError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002477
Lars Gustäbel1b512722010-06-03 12:45:16 +00002478 def _find_link_target(self, tarinfo):
2479 """Find the target member of a symlink or hardlink member in the
2480 archive.
2481 """
2482 if tarinfo.issym():
2483 # Always search the entire archive.
2484 linkname = os.path.dirname(tarinfo.name) + "/" + tarinfo.linkname
2485 limit = None
2486 else:
2487 # Search the archive before the link, because a hard link is
2488 # just a reference to an already archived file.
2489 linkname = tarinfo.linkname
2490 limit = tarinfo
2491
2492 member = self._getmember(linkname, tarinfo=limit, normalize=True)
2493 if member is None:
2494 raise KeyError("linkname %r not found" % linkname)
2495 return member
2496
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002497 def __iter__(self):
2498 """Provide an iterator object.
2499 """
2500 if self._loaded:
2501 return iter(self.members)
2502 else:
2503 return TarIter(self)
2504
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002505 def _dbg(self, level, msg):
2506 """Write debugging output to sys.stderr.
2507 """
2508 if level <= self.debug:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00002509 print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002510
2511 def __enter__(self):
2512 self._check()
2513 return self
2514
2515 def __exit__(self, type, value, traceback):
2516 if type is None:
2517 self.close()
2518 else:
2519 # An exception occurred. We must not call close() because
2520 # it would try to write end-of-archive blocks and padding.
2521 if not self._extfileobj:
2522 self.fileobj.close()
2523 self.closed = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002524# class TarFile
2525
class TarIter:
    """Iterator over the members of a TarFile.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object for the given TarFile.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return the iterator object itself.
        """
        return self

    def __next__(self):
        """Return the next member. While the TarFile is not fully
        loaded the member is read with TarFile's next() method; when
        no member is left the TarFile is marked as _loaded.
        """
        archive = self.tarfile
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely. Once the
        # archive is loaded, continue from the member list instead.
        if archive._loaded:
            try:
                tarinfo = archive.members[self.index]
            except IndexError:
                raise StopIteration
        else:
            tarinfo = archive.next()
            if not tarinfo:
                archive._loaded = True
                raise StopIteration
        self.index += 1
        return tarinfo
2561
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002562#--------------------
2563# exported functions
2564#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
    are able to handle, else return False.

    The archive is opened and closed again right away; a TarError
    raised while doing so yields False.
    """
    try:
        open(name).close()
    except TarError:
        return False
    return True
2575
# Keep the previous binding of open() reachable as bltn_open (makefile()
# uses it to create regular files), then make the module-level open()
# an alias for TarFile.open.
bltn_open = open
open = TarFile.open