blob: cc7514d0a63b8d977a7420ea7cf333569d8d2745 [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
# Module metadata.  The "$...$" values look like version-control keyword
# placeholders -- TODO confirm whether anything still expands them.
__version__ = "$Revision$"

version = "0.9.0"
__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
__date__ = "$Date$"
__cvsid__ = "$Id$"
__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000039
40#---------
41# Imports
42#---------
43import sys
44import os
45import shutil
46import stat
47import errno
48import time
49import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000050import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000051import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000052
53try:
54 import grp, pwd
55except ImportError:
56 grp = pwd = None
57
# Tuple of exception types that signal "symlinks cannot be created here".
# os.symlink on Windows prior to 6.0 raises NotImplementedError
symlink_exception = (AttributeError, NotImplementedError)
try:
    # WindowsError (1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (WindowsError,)
except NameError:
    # The name WindowsError does not exist on this platform; keep the
    # two-element tuple defined above.
    pass
66
# from tarfile import *
# Names exported by a star-import of this module.
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
69
Georg Brandl1a3284e2007-12-02 09:40:06 +000070from builtins import open as _open # Since 'open' is TarFile.open
Guido van Rossum8f78fe92006-08-24 04:03:53 +000071
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
# The magic and version header fields together occupy 8 bytes.  The GNU
# value is "ustar" followed by TWO spaces and a NUL; a single space would
# give a 7-byte constant that can never match an 8-byte header slice.
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000107
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Maps the field name to the callable
# (float or int) associated with that field.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}
143
#---------------------------------------------------------
# Bits used in the mode field, values in octal.
#---------------------------------------------------------
# File-type bits.
S_IFLNK = 0o120000        # symbolic link
S_IFREG = 0o100000        # regular file
S_IFBLK = 0o060000        # block device
S_IFDIR = 0o040000        # directory
S_IFCHR = 0o020000        # character device
S_IFIFO = 0o010000        # fifo

# Special permission bits.
TSUID = 0o4000            # set UID on execution
TSGID = 0o2000            # set GID on execution
TSVTX = 0o1000            # reserved

# Owner / group / other permission bits.
TUREAD = 0o400            # read by owner
TUWRITE = 0o200           # write by owner
TUEXEC = 0o100            # execute/search by owner
TGREAD = 0o040            # read by group
TGWRITE = 0o020           # write by group
TGEXEC = 0o010            # execute/search by group
TOREAD = 0o004            # read by other
TOWRITE = 0o002           # write by other
TOEXEC = 0o001            # execute/search by other
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000167
#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# Module-wide encoding constant: fixed to UTF-8 when os.name is "nt" or
# "ce", otherwise whatever the interpreter reports as the filesystem
# encoding.
ENCODING = "utf-8" if os.name in ("nt", "ce") else sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000175
176#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000177# Some useful functions
178#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000179
def stn(s, length, encoding, errors):
    """Encode the string s and fit it into a field of exactly length bytes.

    The encoded value is cut off after length bytes if it is too long,
    and padded on the right with NUL bytes if it is too short.
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, NUL)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000185
def nts(s, encoding, errors):
    """Decode a bytes field into a string, stopping at the first null byte.

    Everything from the first b"\\0" onwards is discarded; a field
    without a null byte is decoded in full.
    """
    head, _, _ = s.partition(b"\0")
    return head.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000193
def nti(s):
    """Convert a number field to a python number.

    s is the raw bytes of a header number field.  Two encodings are
    understood (see itn()):

    * a string of octal digits terminated by a null byte;
    * the GNU extension: a leading 0o200 byte followed by a big-endian
      base-256 representation in the remaining bytes.

    Returns the decoded integer.  Raises InvalidHeaderError if an octal
    field does not contain a valid number.
    """
    # Indexing a bytes object yields an int, so the marker must be compared
    # against the integer 0o200.  Comparing against chr(0o200) (a str) is
    # always unequal and would send every field down the octal branch.
    if s[0] != 0o200:
        try:
            n = int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    else:
        n = 0
        # The bytes after the marker are already ints; no ord() needed.
        for byte in s[1:]:
            n <<= 8
            n += byte
    return n
210
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field of digits bytes.

    Returns the encoded field; raises ValueError("overflow in number
    field") when the value cannot be represented.
    """
    # POSIX 1003.1-1988 stores numbers as octal digits followed by a
    # null byte, which covers 0 .. 8**(digits-1) - 1.  GNU tar can store
    # larger values: a leading 0o200 byte marks the field, and the
    # remaining digits-1 bytes hold a big-endian representation, which
    # covers values up to 256**(digits-1) - 1.
    width = digits - 1

    if 0 <= n < 8 ** width:
        return bytes("%0*o" % (width, n), "ascii") + NUL

    if format != GNU_FORMAT or n >= 256 ** width:
        raise ValueError("overflow in number field")

    if n < 0:
        # XXX We mimic GNU tar's behaviour with negative numbers,
        # this could raise OverflowError.
        n = struct.unpack("L", struct.pack("l", n))[0]

    field = bytearray()
    for _ in range(width):
        # Emit the least significant byte first, always at the front,
        # so the final order is big-endian.
        field.insert(0, n & 0o377)
        n >>= 8
    field.insert(0, 0o200)
    return field
237
def calc_chksums(buf):
    """Return (unsigned_chksum, signed_chksum) for a 512-byte header block.

    All bytes of the header are summed except for the chksum field
    (bytes 148..155), which is counted as if it were filled with spaces.
    According to the GNU tar sources, some tars (Sun and NeXT) calculate
    the checksum with signed chars, which gives a different result when
    bytes with the high bit set are present, so both variants are
    computed.
    """
    # Eight spaces (0x20 each) stand in for the chksum field: 8 * 32 == 256.
    # "8x" skips those eight bytes while unpacking.
    as_unsigned = struct.unpack_from("148B8x356B", buf)
    as_signed = struct.unpack_from("148b8x356b", buf)
    return 256 + sum(as_unsigned), 256 + sum(as_signed)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000250
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.

    With length None the whole remaining content of src is copied; with
    length 0 nothing is read or written.  Raises IOError("end of file
    reached") if src runs out before length bytes were copied.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    chunk_size = 16 * 1024

    def transfer(count):
        # Move exactly count bytes; a short read means src is exhausted.
        data = src.read(count)
        if len(data) < count:
            raise IOError("end of file reached")
        dst.write(data)

    full_chunks, tail = divmod(length, chunk_size)
    for _ in range(full_chunks):
        transfer(chunk_size)
    if tail != 0:
        transfer(tail)
    return
275
# Lookup table used by filemode().  Each inner tuple describes one
# character position of the mode string; its (bits, char) pairs are tried
# in order and the first pair whose bits are all set supplies the
# character for that position.
filemode_table = (
    # position 0: file type
    ((S_IFLNK, "l"),
     (S_IFREG, "-"),
     (S_IFBLK, "b"),
     (S_IFDIR, "d"),
     (S_IFCHR, "c"),
     (S_IFIFO, "p")),

    # positions 1-3: owner read / write / execute (with set-UID)
    ((TUREAD, "r"),),
    ((TUWRITE, "w"),),
    ((TUEXEC|TSUID, "s"),
     (TSUID, "S"),
     (TUEXEC, "x")),

    # positions 4-6: group read / write / execute (with set-GID)
    ((TGREAD, "r"),),
    ((TGWRITE, "w"),),
    ((TGEXEC|TSGID, "s"),
     (TSGID, "S"),
     (TGEXEC, "x")),

    # positions 7-9: other read / write / execute (with TSVTX)
    ((TOREAD, "r"),),
    ((TOWRITE, "w"),),
    ((TOEXEC|TSVTX, "t"),
     (TSVTX, "T"),
     (TOEXEC, "x"))
)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000302
def filemode(mode):
    """Render a file's mode bits as a string of the form -rwxrwxrwx.

    One character is produced per entry of filemode_table: the character
    of the first (bits, char) pair whose bits are all set in mode, or
    "-" when none of the pairs match.
    Used by TarFile.list()
    """
    chars = []
    for candidates in filemode_table:
        matched = next(
            (char for bit, char in candidates if mode & bit == bit),
            "-",
        )
        chars.append(matched)
    return "".join(chars)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000317
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000351
352#---------------------------
353# internal stream interface
354#---------------------------
class _LowLevelFile:
    """Minimal file object built directly on os.open()/os.read()/os.write().

    It supports reading and writing and is used instead of a regular
    file object for streaming access.
    """

    def __init__(self, name, mode):
        """Open name for reading ("r") or writing ("w").

        Any other mode raises KeyError.  In write mode the file is
        created if necessary and truncated.
        """
        flags_by_mode = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }
        # O_BINARY is only added where the os module defines it.
        flags = flags_by_mode[mode] | getattr(os, "O_BINARY", 0)
        self.fd = os.open(name, flags, 0o666)

    def close(self):
        """Close the underlying file descriptor."""
        os.close(self.fd)

    def read(self, size):
        """Return the result of one os.read() of at most size bytes."""
        return os.read(self.fd, size)

    def write(self, s):
        """Pass s to a single os.write() call; returns None."""
        os.write(self.fd, s)
378
class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip or bzip2 compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.

        name     -- file name; used to open the file when fileobj is None
                    and written into the gzip header in write mode.
        mode     -- "r" for reading, anything else is treated as writing.
        comptype -- "tar" (no compression), "gz", "bz2", or "*" to detect
                    the compression from the first block of fileobj.
        fileobj  -- object with read()/write(); if None, name is opened
                    with _LowLevelFile and closed again by close().
        bufsize  -- block size used for transfers to/from fileobj.

        Raises CompressionError if the module needed for comptype cannot
        be imported.
        """
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""
        self.pos = 0
        self.closed = False
        # Exception type that the active decompressor raises for corrupt
        # input; _read() translates it into ReadError.
        self.exception = IOError

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                # zlib reports bad data with zlib.error, not IOError.
                self.exception = zlib.error
                if mode == "r":
                    self._init_read_gz()
                else:
                    self._init_write_gz()

            if comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                else:
                    self.cmp = bz2.BZ2Compressor()
        except:
            # Do not leak a file we opened ourselves; the exception is
            # re-raised unchanged.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # "closed" may be missing if __init__ failed before setting it.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        # Negative wbits: raw deflate stream, the gzip header and trailer
        # are written by this class itself.
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)
        timestamp = struct.pack("<L", int(time.time()))
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        # pos counts uncompressed bytes.
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Write string s to the stream if a whole new block
           is ready to be written.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        if self.mode == "w" and self.comptype != "tar":
            self.buf += self.cmp.flush()

        if self.mode == "w" and self.buf:
            self.fileobj.write(self.buf)
            self.buf = b""
            if self.comptype == "gz":
                # The native zlib crc is an unsigned 32-bit integer, but
                # the Python wrapper implicitly casts that to a signed C
                # long.  So, on a 32-bit box self.crc may "look negative",
                # while the same crc on a 64-bit box may "look positive".
                # To avoid irksome warnings from the `struct` module, force
                # it to look positive on all boxes.
                self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
                self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))

        if not self._extfileobj:
            self.fileobj.close()

        self.closed = True

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Consumes the gzip member header from the raw stream.  Raises
        ReadError if the magic number is wrong and CompressionError if
        the compression method is not deflate.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        # Skip the six header bytes that follow the flag byte.
        self.__read(6)

        if flag & 4:
            # Optional extra field: 2-byte little-endian length + payload.
            # It is part of the uncompressed header, so it must be skipped
            # on the raw stream with __read(); read() would push the bytes
            # through the decompressor and advance self.pos.
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.__read(xlen)
        if flag & 8:
            # Null-terminated field, skipped byte by byte.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            # Second null-terminated field, skipped the same way.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            # Two more header bytes to skip.
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.

           Seeking forward is implemented by reading and discarding
           data.  Raises StreamError when pos lies before the current
           position.  Returns the new position.
        """
        if pos - self.pos >= 0:
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            # The chunks are bytes objects, so they must be joined with a
            # bytes separator; "".join() raises TypeError here.
            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream.

        For compressed streams, raw blocks are decompressed until size
        bytes are available or the raw stream is exhausted.  Raises
        ReadError if the decompressor rejects the data.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception:
                raise ReadError("invalid compressed data")
            self.dbuf += buf
            c += len(buf)
        buf = self.dbuf[:size]
        self.dbuf = self.dbuf[size:]
        return buf

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            self.buf += buf
            c += len(buf)
        buf = self.buf[:size]
        self.buf = self.buf[size:]
        return buf
# class _Stream
610
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        """Wrap fileobj and read its first BLOCKSIZE bytes for sniffing."""
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        """Return the sniffed first block, ignoring size.

        The call also rebinds self.read to the wrapped object's read(),
        so every later read goes straight to the underlying file object.
        """
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Return "gz", "bz2" or "tar" depending on the sniffed block."""
        if self.buf.startswith(b"\037\213\010"):
            return "gz"
        # A bzip2 stream starts with "BZh", one digit giving the block
        # size level (1-9), then the block magic "1AY&SY".  Matching the
        # literal b"BZh91" would only recognise level 9, so the level
        # byte at index 3 is deliberately not compared.
        if self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            return "bz2"
        return "tar"

    def close(self):
        """Close the wrapped file object."""
        self.fileobj.close()
# class StreamProxy
634
class _BZ2Proxy(object):
    """Small proxy class that enables external file object
       support for "r:bz2" and "w:bz2" modes. This is actually
       a workaround for a limitation in bz2 module's BZ2File
       class which (unlike gzip.GzipFile) has no support for
       a file object argument.
    """

    # Number of raw bytes requested from the file object per read.
    blocksize = 16 * 1024

    def __init__(self, fileobj, mode):
        """Wrap fileobj; mode "r" decompresses, any other mode compresses."""
        self.fileobj = fileobj
        self.mode = mode
        self.name = getattr(self.fileobj, "name", None)
        self.init()

    def init(self):
        """(Re)create the bz2 object and reset the position to 0.

        In read mode the wrapped file object is rewound to its start and
        the buffer of decompressed data is emptied.
        """
        import bz2
        self.pos = 0
        if self.mode != "r":
            self.bz2obj = bz2.BZ2Compressor()
            return
        self.bz2obj = bz2.BZ2Decompressor()
        self.fileobj.seek(0)
        self.buf = b""

    def read(self, size):
        """Return up to size decompressed bytes and advance the position."""
        # Decompress raw blocks until enough data is buffered or the
        # underlying file object has nothing more to give.
        while len(self.buf) < size:
            raw = self.fileobj.read(self.blocksize)
            if not raw:
                break
            self.buf += self.bz2obj.decompress(raw)

        chunk, self.buf = self.buf[:size], self.buf[size:]
        self.pos += len(chunk)
        return chunk

    def seek(self, pos):
        """Move to pos in the decompressed data.

        Going backwards restarts decompression from the beginning; the
        distance to pos is then covered by reading and discarding.
        """
        if pos < self.pos:
            self.init()
        self.read(pos - self.pos)

    def tell(self):
        """Return the current position in the uncompressed data."""
        return self.pos

    def write(self, data):
        """Compress data and hand the output to the file object."""
        self.pos += len(data)
        self.fileobj.write(self.bz2obj.compress(data))

    def close(self):
        """In "w" mode, flush the compressor into the file object.

        The wrapped file object itself is not closed here.
        """
        if self.mode == "w":
            self.fileobj.write(self.bz2obj.flush())
# class _BZ2Proxy
694
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000695#------------------------
696# Extraction file object
697#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, sparse=None):
        """Expose size bytes of fileobj starting at offset.

        sparse, if not None, is an object whose find(position) returns
        the section covering that position (or None); it switches read()
        to the sparse-file code path.
        """
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.sparse = sparse
        self.position = 0

    def seekable(self):
        """Return the wrapped object's seekable(), or True if it has none."""
        try:
            probe = self.fileobj.seekable
        except AttributeError:
            # XXX gzip.GzipFile and bz2.BZ2File
            return True
        return probe()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position):
        """Seek to a position in the file.
        """
        self.position = position

    def read(self, size=None):
        """Read data from the file.

        At most size bytes are returned, never more than what is left
        between the current position and the end of the member; with
        size None everything that is left is returned.
        """
        remaining = self.size - self.position
        size = remaining if size is None else min(size, remaining)

        reader = self.readnormal if self.sparse is None else self.readsparse
        return reader(size)

    def readnormal(self, size):
        """Read operation for regular files.
        """
        start = self.offset + self.position
        self.fileobj.seek(start)
        self.position += size
        return self.fileobj.read(size)

    def readsparse(self, size):
        """Read operation for sparse files.
        """
        pieces = []
        while size > 0:
            piece = self.readsparsesection(size)
            if not piece:
                break
            size -= len(piece)
            pieces.append(piece)
        return b"".join(pieces)

    def readsparsesection(self, size):
        """Read a single section of a sparse file.

        Returns b"" when no section covers the current position.  The
        read never crosses the end of the section.
        """
        section = self.sparse.find(self.position)
        if section is None:
            return b""

        size = min(size, section.offset + section.size - self.position)

        if not isinstance(section, _data):
            # Not a data section: nothing is stored, hand out NUL bytes.
            self.position += size
            return NUL * size

        # Data section: translate the logical position into the position
        # of the stored bytes before advancing.
        realpos = section.realpos + self.position - section.offset
        self.fileobj.seek(self.offset + realpos)
        self.position += size
        return self.fileobj.read(size)
#class _FileInFile
778
779
780class ExFileObject(object):
781 """File-like object for reading an archive member.
782 Is returned by TarFile.extractfile().
783 """
784 blocksize = 1024
785
786 def __init__(self, tarfile, tarinfo):
787 self.fileobj = _FileInFile(tarfile.fileobj,
788 tarinfo.offset_data,
789 tarinfo.size,
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000790 tarinfo.sparse)
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000791 self.name = tarinfo.name
792 self.mode = "r"
793 self.closed = False
794 self.size = tarinfo.size
795
796 self.position = 0
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000797 self.buffer = b""
798
    def readable(self):
        """Always return True."""
        return True
801
    def writable(self):
        """Always return False."""
        return False
804
    def seekable(self):
        """Return whatever the underlying file object's seekable() reports."""
        return self.fileobj.seekable()
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000807
808 def read(self, size=None):
809 """Read at most size bytes from the file. If size is not
810 present or None, read all data until EOF is reached.
811 """
812 if self.closed:
813 raise ValueError("I/O operation on closed file")
814
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000815 buf = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000816 if self.buffer:
817 if size is None:
818 buf = self.buffer
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000819 self.buffer = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000820 else:
821 buf = self.buffer[:size]
822 self.buffer = self.buffer[size:]
823
824 if size is None:
825 buf += self.fileobj.read()
826 else:
827 buf += self.fileobj.read(size - len(buf))
828
829 self.position += len(buf)
830 return buf
831
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000832 # XXX TextIOWrapper uses the read1() method.
833 read1 = read
834
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000835 def readline(self, size=-1):
836 """Read one entire line from the file. If size is present
837 and non-negative, return a string with at most that
838 size, which may be an incomplete line.
839 """
840 if self.closed:
841 raise ValueError("I/O operation on closed file")
842
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000843 pos = self.buffer.find(b"\n") + 1
844 if pos == 0:
845 # no newline found.
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000846 while True:
847 buf = self.fileobj.read(self.blocksize)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000848 self.buffer += buf
849 if not buf or b"\n" in buf:
850 pos = self.buffer.find(b"\n") + 1
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000851 if pos == 0:
852 # no newline found.
853 pos = len(self.buffer)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000854 break
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000855
856 if size != -1:
857 pos = min(size, pos)
858
859 buf = self.buffer[:pos]
860 self.buffer = self.buffer[pos:]
861 self.position += len(buf)
862 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000863
864 def readlines(self):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000865 """Return a list with all remaining lines.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000866 """
867 result = []
868 while True:
869 line = self.readline()
870 if not line: break
871 result.append(line)
872 return result
873
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000874 def tell(self):
875 """Return the current file position.
876 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000877 if self.closed:
878 raise ValueError("I/O operation on closed file")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000879
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000880 return self.position
881
882 def seek(self, pos, whence=os.SEEK_SET):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000883 """Seek to a position in the file.
884 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000885 if self.closed:
886 raise ValueError("I/O operation on closed file")
887
888 if whence == os.SEEK_SET:
889 self.position = min(max(pos, 0), self.size)
890 elif whence == os.SEEK_CUR:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000891 if pos < 0:
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000892 self.position = max(self.position + pos, 0)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000893 else:
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000894 self.position = min(self.position + pos, self.size)
895 elif whence == os.SEEK_END:
896 self.position = max(min(self.size + pos, self.size), 0)
897 else:
898 raise ValueError("Invalid argument")
899
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000900 self.buffer = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000901 self.fileobj.seek(self.position)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000902
903 def close(self):
904 """Close the file object.
905 """
906 self.closed = True
Martin v. Löwisdf241532005-03-03 08:17:42 +0000907
908 def __iter__(self):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000909 """Get an iterator over the file's lines.
Martin v. Löwisdf241532005-03-03 08:17:42 +0000910 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000911 while True:
912 line = self.readline()
913 if not line:
914 break
915 yield line
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000916#class ExFileObject
917
918#------------------
919# Exported Classes
920#------------------
921class TarInfo(object):
922 """Informational class which holds the details about an
923 archive member given by a tar header block.
924 TarInfo objects are returned by TarFile.getmember(),
925 TarFile.getmembers() and TarFile.gettarinfo() and are
926 usually created internally.
927 """
928
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000929 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
930 "chksum", "type", "linkname", "uname", "gname",
931 "devmajor", "devminor",
932 "offset", "offset_data", "pax_headers", "sparse",
933 "tarfile", "_sparse_structs", "_link_target")
934
def __init__(self, name=""):
    """Construct a TarInfo object. name is the optional name
       of the member.
    """
    # Fields that are stored in the tar header block.
    self.name = name                    # member name
    self.mode = 0o644                   # file permissions
    self.uid = self.gid = 0             # user id, group id
    self.size = self.mtime = 0          # file size, modification time
    self.chksum = 0                     # header checksum
    self.type = REGTYPE                 # member type
    self.linkname = ""                  # link name
    self.uname = self.gname = ""        # user name, group name
    self.devmajor = self.devminor = 0   # device major/minor number

    # Positions inside the archive.
    self.offset = 0                     # the tar header starts here
    self.offset_data = 0                # the file's data starts here

    self.sparse = None                  # sparse member information
    self.pax_headers = {}               # pax header information
958
# pax headers call the "name" and "linkname" fields "path" and
# "linkpath"; expose both under those names as plain aliases.
def _getpath(self):
    """Return the member name."""
    return self.name
def _setpath(self, name):
    """Set the member name."""
    self.name = name
path = property(fget=_getpath, fset=_setpath)

def _getlinkpath(self):
    """Return the link name."""
    return self.linkname
def _setlinkpath(self, linkname):
    """Set the link name."""
    self.linkname = linkname
linkpath = property(fget=_getlinkpath, fset=_setlinkpath)
972
def __repr__(self):
    """Return "<ClassName 'member name' at 0xADDRESS>".
    """
    return "<{0} {1!r} at {2:#x}>".format(
        self.__class__.__name__, self.name, id(self))
975
def get_info(self):
    """Return the TarInfo's attributes as a dictionary.

       The mode is reduced to its lower 12 bits and the name of a
       directory member always ends with a slash.
    """
    fields = ("name", "mode", "uid", "gid", "size", "mtime", "chksum",
              "type", "linkname", "uname", "gname", "devmajor", "devminor")
    info = dict((field, getattr(self, field)) for field in fields)
    info["mode"] = info["mode"] & 0o7777

    is_dir = info["type"] == DIRTYPE
    if is_dir and not info["name"].endswith("/"):
        info["name"] += "/"

    return info
999
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
    """Return a tar header as a string of 512 byte blocks.

       format selects the header flavour and must be one of
       USTAR_FORMAT, GNU_FORMAT or PAX_FORMAT; anything else raises
       ValueError.
    """
    info = self.get_info()

    if format == USTAR_FORMAT:
        return self.create_ustar_header(info, encoding, errors)
    if format == GNU_FORMAT:
        return self.create_gnu_header(info, encoding, errors)
    if format == PAX_FORMAT:
        # The pax creator takes no error handler.
        return self.create_pax_header(info, encoding)
    raise ValueError("invalid format")
1013
def create_ustar_header(self, info, encoding, errors):
    """Return the object as a ustar header block.

       Raise ValueError if the linkname does not fit into the header
       or the name cannot be split into a prefix and a name part.
    """
    info["magic"] = POSIX_MAGIC

    linkname, name = info["linkname"], info["name"]
    if len(linkname) > LENGTH_LINK:
        raise ValueError("linkname is too long")

    if len(name) > LENGTH_NAME:
        # Move the leading directories into the prefix field.
        prefix, shortname = self._posix_split_name(name)
        info["prefix"] = prefix
        info["name"] = shortname

    return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001026
def create_gnu_header(self, info, encoding, errors):
    """Return the object as a GNU header block sequence.

       An over-long linkname and/or name is emitted first as a
       GNU long-link / long-name sequence, followed by the regular
       header block.
    """
    info["magic"] = GNU_MAGIC

    chunks = []
    if len(info["linkname"]) > LENGTH_LINK:
        chunks.append(self._create_gnu_long_header(
            info["linkname"], GNUTYPE_LONGLINK, encoding, errors))

    if len(info["name"]) > LENGTH_NAME:
        chunks.append(self._create_gnu_long_header(
            info["name"], GNUTYPE_LONGNAME, encoding, errors))

    chunks.append(self._create_header(info, GNU_FORMAT, encoding, errors))
    return b"".join(chunks)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001040
def create_pax_header(self, info, encoding):
    """Return the object as a ustar header block. If it cannot be
       represented this way, prepend a pax extended header sequence
       with supplement information.

       info is modified in place: the magic is set and numeric fields
       that are moved to the pax header are reset to 0.
    """
    info["magic"] = POSIX_MAGIC
    pax_headers = self.pax_headers.copy()

    # String fields go into the pax header when they are not pure
    # ASCII or when they exceed the length of the ustar field.
    string_fields = (
        ("name", "path", LENGTH_NAME),
        ("linkname", "linkpath", LENGTH_LINK),
        ("uname", "uname", 32),
        ("gname", "gname", 32),
    )
    for name, hname, length in string_fields:
        if hname in pax_headers:
            # The pax header has priority.
            continue

        text = info[name]
        try:
            text.encode("ascii", "strict")
        except UnicodeEncodeError:
            needs_record = True
        else:
            needs_record = len(text) > length

        if needs_record:
            pax_headers[hname] = text

    # Number fields go into the pax header when they exceed the field
    # limit or are floats.
    number_fields = (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12))
    for name, digits in number_fields:
        if name in pax_headers:
            # The pax header has priority. Avoid overflow.
            info[name] = 0
            continue

        val = info[name]
        if isinstance(val, float) or not 0 <= val < 8 ** (digits - 1):
            pax_headers[name] = str(val)
            info[name] = 0

    # Create a pax extended header if necessary.
    if pax_headers:
        extended = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
    else:
        extended = b""

    return extended + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001089
@classmethod
def create_pax_global_header(cls, pax_headers):
    """Return the object as a pax global header block sequence.
       The records are always encoded as "utf8".
    """
    header_type = XGLTYPE
    return cls._create_pax_generic_header(pax_headers, header_type, "utf8")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001095
def _posix_split_name(self, name):
    """Split a name longer than 100 chars into a prefix
       and a name part.

       The split happens at the last "/" within the first
       LENGTH_PREFIX + 1 characters. Raise ValueError if there is no
       such slash, the prefix would be empty, or the remaining name is
       still longer than LENGTH_NAME.
    """
    # Index just behind the last usable slash, 0 if there is none.
    cut = name.rfind("/", 0, LENGTH_PREFIX + 1) + 1

    # The separating slash itself belongs to neither part.
    prefix = name[:cut][:-1]
    name = name[cut:]

    if len(name) > LENGTH_NAME or not prefix:
        raise ValueError("name is too long")
    return prefix, name
1110
@staticmethod
def _create_header(info, format, encoding, errors):
    """Return a header block. info is a dictionary with file
       information, format must be one of the *_FORMAT constants.

       Missing keys in info fall back to empty/zero defaults. String
       fields are encoded with encoding and errors. The checksum is
       computed over the finished block and patched into its field.
    """
    parts = [
        stn(info.get("name", ""), 100, encoding, errors),
        itn(info.get("mode", 0) & 0o7777, 8, format),
        itn(info.get("uid", 0), 8, format),
        itn(info.get("gid", 0), 8, format),
        itn(info.get("size", 0), 12, format),
        itn(info.get("mtime", 0), 12, format),
        # The checksum field is 8 bytes wide (offsets 148..155, which is
        # where frombuf() reads it back) and is filled with spaces while
        # the checksum is calculated. A shorter placeholder would shift
        # every following field out of position.
        b"        ",
        info.get("type", REGTYPE),
        stn(info.get("linkname", ""), 100, encoding, errors),
        info.get("magic", POSIX_MAGIC),
        stn(info.get("uname", ""), 32, encoding, errors),
        stn(info.get("gname", ""), 32, encoding, errors),
        itn(info.get("devmajor", 0), 8, format),
        itn(info.get("devminor", 0), 8, format),
        stn(info.get("prefix", ""), 155, encoding, errors)
    ]

    # Pad the joined fields with NULs up to a full block.
    buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
    chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
    # With a 512 byte block, -364 and -357 are offsets 148 and 155:
    # seven checksum bytes ("%06o" plus NUL) replace the start of the
    # placeholder, its last space stays in place.
    buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
    return buf
1138
@staticmethod
def _create_payload(payload):
    """Return the string payload filled with zero bytes
       up to the next 512 byte border.
    """
    overhang = len(payload) % BLOCKSIZE
    if overhang:
        # Fill the last, partially used block.
        payload += NUL * (BLOCKSIZE - overhang)
    return payload
1148
@classmethod
def _create_gnu_long_header(cls, name, type, encoding, errors):
    """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
       for name.

       The sequence is a header block named "././@LongLink" followed
       by the NUL-terminated, encoded name padded to full blocks.
    """
    encoded = name.encode(encoding, errors) + NUL

    info = {
        "name": "././@LongLink",
        "type": type,
        "size": len(encoded),
        "magic": GNU_MAGIC,
    }

    # create extended header + name blocks.
    header = cls._create_header(info, USTAR_FORMAT, encoding, errors)
    return header + cls._create_payload(encoded)
1165
@classmethod
def _create_pax_generic_header(cls, pax_headers, type, encoding):
    """Return a POSIX.1-2008 extended or global header sequence
       that contains a list of keyword, value pairs. The values
       must be strings.
    """
    # Check if one of the fields contains surrogate characters and thereby
    # forces hdrcharset=BINARY, see _proc_pax() for more information.
    binary = False
    for _, text in pax_headers.items():
        try:
            text.encode("utf8", "strict")
        except UnicodeEncodeError:
            binary = True
            break

    chunks = []
    if binary:
        # Put the hdrcharset field at the beginning of the header.
        chunks.append(b"21 hdrcharset=BINARY\n")

    for keyword, value in pax_headers.items():
        keyword = keyword.encode("utf8")
        if binary:
            # Try to restore the original byte representation of `value'.
            # Needless to say, that the encoding must match the string.
            value = value.encode(encoding, "surrogateescape")
        else:
            value = value.encode("utf8")

        # A record is "<length> <keyword>=<value>\n" and <length> counts
        # its own digits, so iterate until the total stops changing.
        base = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
        total = 0
        while total != base + len(str(total)):
            total = base + len(str(total))

        chunks.append(bytes(str(total), "ascii") + b" " + keyword + b"=" + value + b"\n")

    records = b"".join(chunks)

    # We use a hardcoded "././@PaxHeader" name like star does
    # instead of the one that POSIX recommends.
    info = {
        "name": "././@PaxHeader",
        "type": type,
        "size": len(records),
        "magic": POSIX_MAGIC,
    }

    # Create pax header + record blocks.
    header = cls._create_header(info, USTAR_FORMAT, "ascii", "replace")
    return header + cls._create_payload(records)
1216
@classmethod
def frombuf(cls, buf, encoding, errors):
    """Construct a TarInfo object from a 512 byte bytes object.

       Raise EmptyHeaderError, TruncatedHeaderError, EOFHeaderError or
       InvalidHeaderError if buf is empty, has the wrong length,
       consists only of NUL bytes or carries a bad checksum.
    """
    buflen = len(buf)
    if buflen == 0:
        raise EmptyHeaderError("empty header")
    if buflen != BLOCKSIZE:
        raise TruncatedHeaderError("truncated header")
    if buf.count(NUL) == BLOCKSIZE:
        raise EOFHeaderError("end of file header")

    chksum = nti(buf[148:156])
    if chksum not in calc_chksums(buf):
        raise InvalidHeaderError("bad checksum")

    # Decode the fixed-width header fields.
    obj = cls()
    obj.name = nts(buf[0:100], encoding, errors)
    obj.mode = nti(buf[100:108])
    obj.uid = nti(buf[108:116])
    obj.gid = nti(buf[116:124])
    obj.size = nti(buf[124:136])
    obj.mtime = nti(buf[136:148])
    obj.chksum = chksum
    obj.type = buf[156:157]
    obj.linkname = nts(buf[157:257], encoding, errors)
    obj.uname = nts(buf[265:297], encoding, errors)
    obj.gname = nts(buf[297:329], encoding, errors)
    obj.devmajor = nti(buf[329:337])
    obj.devminor = nti(buf[337:345])
    prefix = nts(buf[345:500], encoding, errors)

    # Old V7 tar format represents a directory as a regular
    # file with a trailing slash.
    if obj.type == AREGTYPE and obj.name.endswith("/"):
        obj.type = DIRTYPE

    # The old GNU sparse format occupies some of the unused
    # space in the buffer for up to 4 sparse structures.
    # Save them for later processing in _proc_sparse().
    if obj.type == GNUTYPE_SPARSE:
        structs = []
        # Four 24 byte entries (12 byte offset + 12 byte size)
        # starting at byte 386.
        for pos in range(386, 386 + 4 * 24, 24):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            structs.append((offset, numbytes))
        isextended = bool(buf[482])
        origsize = nti(buf[483:495])
        obj._sparse_structs = (structs, isextended, origsize)

    # Remove redundant slashes from directories.
    if obj.isdir():
        obj.name = obj.name.rstrip("/")

    # Reconstruct a ustar longname.
    if prefix and obj.type not in GNU_TYPES:
        obj.name = prefix + "/" + obj.name
    return obj
1279
@classmethod
def fromtarfile(cls, tarfile):
    """Return the next TarInfo object from TarFile object
       tarfile.

       One header block is read from tarfile.fileobj, decoded with
       frombuf() and handed to _proc_member() for type specific
       processing.
    """
    header_block = tarfile.fileobj.read(BLOCKSIZE)
    member = cls.frombuf(header_block, tarfile.encoding, tarfile.errors)
    # The file position is now just behind the header block.
    member.offset = tarfile.fileobj.tell() - BLOCKSIZE
    return member._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001289
Guido van Rossumd8faa362007-04-27 19:54:29 +00001290 #--------------------------------------------------------------------------
1291 # The following are methods that are called depending on the type of a
1292 # member. The entry point is _proc_member() which can be overridden in a
1293 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1294 # implement the following
1295 # operations:
1296 # 1. Set self.offset_data to the position where the data blocks begin,
1297 # if there is data that follows.
1298 # 2. Set tarfile.offset to the position where the next member's header will
1299 # begin.
1300 # 3. Return self or another valid TarInfo object.
def _proc_member(self, tarfile):
    """Choose the right processing method depending on
       the type and call it.
    """
    member_type = self.type
    if member_type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
        return self._proc_gnulong(tarfile)
    if member_type == GNUTYPE_SPARSE:
        return self._proc_sparse(tarfile)
    if member_type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
        return self._proc_pax(tarfile)
    # Everything else is handled like a builtin type.
    return self._proc_builtin(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001313
def _proc_builtin(self, tarfile):
    """Process a builtin type or an unknown type which
       will be treated as a regular file.
    """
    data_start = tarfile.fileobj.tell()
    self.offset_data = data_start

    if self.isreg() or self.type not in SUPPORTED_TYPES:
        # Skip the following data blocks.
        tarfile.offset = data_start + self._block(self.size)
    else:
        # No data blocks follow; the next header starts right here.
        tarfile.offset = data_start

    # Patch the TarInfo object with saved global
    # header information.
    self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

    return self
1330
def _proc_gnulong(self, tarfile):
    """Process the blocks that hold a GNU longname
       or longlink member.

       Return the TarInfo object of the following header, patched
       with the long name or link name. Raise SubsequentHeaderError
       if that header is missing or invalid.
    """
    longdata = tarfile.fileobj.read(self._block(self.size))

    # Fetch the next header and process it.
    try:
        member = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Patch the TarInfo object from the next header with
    # the longname information.
    member.offset = self.offset
    if self.type == GNUTYPE_LONGNAME:
        member.name = nts(longdata, tarfile.encoding, tarfile.errors)
    elif self.type == GNUTYPE_LONGLINK:
        member.linkname = nts(longdata, tarfile.encoding, tarfile.errors)

    return member
1352
def _proc_sparse(self, tarfile):
    """Process a GNU sparse header plus extra headers.

       Builds self.sparse from the (offset, size) structures, sets
       self.offset_data and tarfile.offset, and replaces self.size by
       the original size of the sparse file.
    """
    # We already collected some sparse structures in frombuf().
    structs, isextended, origsize = self._sparse_structs
    del self._sparse_structs

    # Collect sparse structures from extended header blocks.
    while isextended:
        buf = tarfile.fileobj.read(BLOCKSIZE)
        # Up to 21 entries of 24 bytes each per extension block.
        for pos in range(0, 21 * 24, 24):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            structs.append((offset, numbytes))
        isextended = bool(buf[504])

    # Transform the sparse structures to something we can use
    # in ExFileObject.
    sections = self.sparse = _ringbuffer()
    lastpos = 0     # end of the previous data section in the file
    realpos = 0     # position of the stored bytes in the archive data
    for offset, numbytes in structs:
        if offset > lastpos:
            # The gap before this data section is a hole.
            sections.append(_hole(lastpos, offset - lastpos))
        sections.append(_data(offset, numbytes, realpos))
        realpos += numbytes
        lastpos = offset + numbytes
    if lastpos < origsize:
        # Trailing hole up to the original file size.
        sections.append(_hole(lastpos, origsize - lastpos))

    self.offset_data = tarfile.fileobj.tell()
    # self.size still is the stored size here.
    tarfile.offset = self.offset_data + self._block(self.size)
    self.size = origsize

    return self
1393
def _proc_pax(self, tarfile):
    """Process an extended or global header as described in
       POSIX.1-2008.

       Return the TarInfo object of the header that follows the pax
       header. Raise InvalidHeaderError if a record declares a length
       of zero, and SubsequentHeaderError if the following header is
       missing or invalid.
    """
    # Read the header information.
    buf = tarfile.fileobj.read(self._block(self.size))

    # A pax header stores supplemental information for either
    # the following file (extended) or all following files
    # (global).
    if self.type == XGLTYPE:
        pax_headers = tarfile.pax_headers
    else:
        pax_headers = tarfile.pax_headers.copy()

    # Check if the pax header contains a hdrcharset field. This tells us
    # the encoding of the path, linkpath, uname and gname fields. Normally,
    # these fields are UTF-8 encoded but since POSIX.1-2008 tar
    # implementations are allowed to store them as raw binary strings if
    # the translation to UTF-8 fails.
    match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
    if match is not None:
        pax_headers["hdrcharset"] = match.group(1).decode("utf8")

    # For the time being, we don't care about anything other than "BINARY".
    # The only other value that is currently allowed by the standard is
    # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
    hdrcharset = pax_headers.get("hdrcharset")
    if hdrcharset == "BINARY":
        encoding = tarfile.encoding
    else:
        encoding = "utf8"

    # Parse pax header information. A record looks like that:
    # "%d %s=%s\n" % (length, keyword, value). length is the size
    # of the complete record including the length field itself and
    # the newline. keyword and value are both UTF-8 encoded strings.
    regex = re.compile(br"(\d+) ([^=]+)=")
    pos = 0
    while True:
        match = regex.match(buf, pos)
        if not match:
            break

        length, keyword = match.groups()
        length = int(length)
        if length == 0:
            # A zero length would leave pos unchanged and make this
            # loop match the same record forever.
            raise InvalidHeaderError("invalid header")
        value = buf[match.end(2) + 1:match.start(1) + length - 1]

        # Normally, we could just use "utf8" as the encoding and "strict"
        # as the error handler, but we better not take the risk. For
        # example, GNU tar <= 1.23 is known to store filenames it cannot
        # translate to UTF-8 as raw strings (unfortunately without a
        # hdrcharset=BINARY header).
        # We first try the strict standard encoding, and if that fails we
        # fall back on the user's encoding and error handler.
        keyword = self._decode_pax_field(keyword, "utf8", "utf8",
                tarfile.errors)
        if keyword in PAX_NAME_FIELDS:
            value = self._decode_pax_field(value, encoding, tarfile.encoding,
                    tarfile.errors)
        else:
            value = self._decode_pax_field(value, "utf8", "utf8",
                    tarfile.errors)

        pax_headers[keyword] = value
        pos += length

    # Fetch the next header.
    try:
        member = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
        # Patch the TarInfo object with the extended header info.
        member._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
        member.offset = self.offset

        if "size" in pax_headers:
            # If the extended header replaces the size field,
            # we need to recalculate the offset where the next
            # header starts.
            offset = member.offset_data
            if member.isreg() or member.type not in SUPPORTED_TYPES:
                offset += member._block(member.size)
            tarfile.offset = offset

    return member
1482
def _apply_pax_info(self, pax_headers, encoding, errors):
    """Overlay the records of a preceding pax extended or global header
       onto this object. Only keywords listed in PAX_FIELDS are applied
       as attributes; a copy of all records is kept in self.pax_headers.
       `encoding' and `errors' are accepted but not used here.
    """
    for field in pax_headers:
        if field in PAX_FIELDS:
            data = pax_headers[field]

            # A stored path never keeps its trailing slashes.
            if field == "path":
                data = data.rstrip("/")

            # Numeric records are converted; unparsable ones become 0.
            if field in PAX_NUMBER_FIELDS:
                convert = PAX_NUMBER_FIELDS[field]
                try:
                    data = convert(data)
                except ValueError:
                    data = 0

            setattr(self, field, data)

    self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001503
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001504 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1505 """Decode a single field from a pax record.
1506 """
1507 try:
1508 return value.decode(encoding, "strict")
1509 except UnicodeDecodeError:
1510 return value.decode(fallback_encoding, fallback_errors)
1511
def _block(self, count):
    """Return `count' rounded up to the next multiple of BLOCKSIZE,
       e.g. _block(834) => 1024.
    """
    full, partial = divmod(count, BLOCKSIZE)
    # A partially filled block still occupies a whole block.
    return (full + 1 if partial else full) * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001520
def isreg(self):
    """Return True if self.type is one of REGULAR_TYPES."""
    return self.type in REGULAR_TYPES
def isfile(self):
    """Alias for isreg()."""
    return self.isreg()
def isdir(self):
    """Return True if self.type is DIRTYPE."""
    return self.type == DIRTYPE
def issym(self):
    """Return True if self.type is SYMTYPE."""
    return self.type == SYMTYPE
def islnk(self):
    """Return True if self.type is LNKTYPE."""
    return self.type == LNKTYPE
def ischr(self):
    """Return True if self.type is CHRTYPE."""
    return self.type == CHRTYPE
def isblk(self):
    """Return True if self.type is BLKTYPE."""
    return self.type == BLKTYPE
def isfifo(self):
    """Return True if self.type is FIFOTYPE."""
    return self.type == FIFOTYPE
def issparse(self):
    """Return True if self.type is GNUTYPE_SPARSE."""
    return self.type == GNUTYPE_SPARSE
def isdev(self):
    """Return True if self.type is CHRTYPE, BLKTYPE or FIFOTYPE."""
    return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1541# class TarInfo
1542
1543class TarFile(object):
1544 """The TarFile Class provides an interface to tar archives.
1545 """
1546
# Class-level defaults. __init__ replaces format, tarinfo, dereference,
# ignore_zeros, encoding, debug and errorlevel on the instance only when
# the matching constructor argument is not None.

debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

dereference = False         # If true, add content of linked file to the
                            # tar file, else the link.

ignore_zeros = False        # If true, skips empty or invalid blocks and
                            # continues processing.

errorlevel = 1              # If 0, fatal errors only appear in debug
                            # messages (if debug >= 0). If > 0, errors
                            # are passed to the caller as exceptions.

format = DEFAULT_FORMAT     # The format to use when creating an archive.

encoding = ENCODING         # Encoding for 8-bit character strings.

errors = None               # Error handler for unicode conversion.
                            # __init__ always assigns self.errors from its
                            # `errors' argument (default "surrogateescape"),
                            # so this value is only seen on the class itself.

tarinfo = TarInfo           # The default TarInfo class to use.

fileobject = ExFileObject   # The default ExFileObject class to use.
1568
def __init__(self, name=None, mode="r", fileobj=None, format=None,
        tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
        errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
    """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
       read from an existing archive, 'a' to append data to an existing
       file or 'w' to create a new file overwriting an existing one. `mode'
       defaults to 'r'.
       If `fileobj' is given, it is used for reading or writing data. If it
       can be determined, `mode' is overridden by `fileobj's mode.
       `fileobj' is not closed, when TarFile is closed.

       The remaining arguments override the class-level defaults of the
       same name when they are not None; `errors' is always stored.
       `pax_headers' is only honoured when the format is PAX_FORMAT.

       Raises ValueError for an invalid `mode' and ReadError if an
       existing archive opened for appending contains a bad header.
    """
    # Compare against the exact set of modes. A substring test such as
    # `mode not in "raw"' would let the empty string through and fail
    # later with a KeyError instead of a ValueError.
    if mode not in ("r", "a", "w"):
        raise ValueError("mode must be 'r', 'a' or 'w'")
    self.mode = mode
    self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]

    if not fileobj:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        self._extfileobj = False
    else:
        if name is None and hasattr(fileobj, "name"):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Init attributes.
    if format is not None:
        self.format = format
    if tarinfo is not None:
        self.tarinfo = tarinfo
    if dereference is not None:
        self.dereference = dereference
    if ignore_zeros is not None:
        self.ignore_zeros = ignore_zeros
    if encoding is not None:
        self.encoding = encoding
    self.errors = errors

    if pax_headers is not None and self.format == PAX_FORMAT:
        self.pax_headers = pax_headers
    else:
        self.pax_headers = {}

    if debug is not None:
        self.debug = debug
    if errorlevel is not None:
        self.errorlevel = errorlevel

    # Init datastructures.
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        # Queried inside the try block so that a file opened above is
        # closed again if tell() fails.
        self.offset = self.fileobj.tell()
                            # current position in the archive file

        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    tarinfo = self.tarinfo.fromtarfile(self)
                    self.members.append(tarinfo)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e))

        if self.mode in ("a", "w"):
            self._loaded = True

            if self.pax_headers:
                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    except:
        # Cleanup only; the exception is always re-raised. A file object
        # supplied by the caller is left open.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001664
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001665 #--------------------------------------------------------------------------
1666 # Below are the classmethods which act as alternate constructors to the
1667 # TarFile class. The open() method is the only one that is needed for
1668 # public use; it is the "super"-constructor and is able to select an
1669 # adequate "sub"-constructor for a particular compression using the mapping
1670 # from OPEN_METH.
1671 #
1672 # This concept allows one to subclass TarFile without losing the comfort of
1673 # the super-constructor. A sub-constructor is registered and made available
1674 # by adding it to the mapping in OPEN_METH.
1675
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
       an appropriate TarFile class.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing

       `bufsize' is only used for the stream ('|') modes. Additional
       keyword arguments are passed on to the selected constructor.

       Raises ValueError if neither `name' nor `fileobj' is given or the
       mode is not recognised, CompressionError for an unknown
       compression type and ReadError if no registered method can open
       the file for reading.
    """

    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Find out which *open() is appropriate for opening the file.
        for comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
            if fileobj is not None:
                saved_pos = fileobj.tell()
            try:
                return func(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Rewind a caller-supplied file object so that the next
                # method starts reading at the same position.
                if fileobj is not None:
                    fileobj.seek(saved_pos)
                continue
        raise ReadError("file could not be opened successfully")

    elif ":" in mode:
        filemode, comptype = mode.split(":", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
        else:
            raise CompressionError("unknown compression type %r" % comptype)
        return func(name, filemode, fileobj, **kwargs)

    elif "|" in mode:
        filemode, comptype = mode.split("|", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Exact comparison: a substring test against "rw" would also
        # accept the two-character mode "rw".
        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            t = cls(name, filemode, stream, **kwargs)
        except:
            # Cleanup only; the exception is re-raised.
            stream.close()
            raise
        t._extfileobj = False
        return t

    # Exact comparison: a substring test against "aw" would also accept
    # the empty string and "aw".
    elif mode in ("a", "w"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001751
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.
       `mode' must be 'r', 'a' or 'w', otherwise ValueError is raised.
       All arguments are passed on to the class constructor.
    """
    # Exact comparison: a substring test against "raw" would also
    # accept the empty string.
    if mode not in ("r", "a", "w"):
        raise ValueError("mode must be 'r', 'a' or 'w'")
    return cls(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001759
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError for a mode other than 'r' or 'w',
       CompressionError if the gzip module is unavailable and ReadError
       if an IOError occurs while opening the archive.
    """
    # Exact comparison: a substring test against "rw" would also
    # accept the empty string.
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    if fileobj is None:
        fileobj = bltn_open(name, mode + "b")
        extfileobj = False
    else:
        extfileobj = True

    try:
        t = cls.taropen(name, mode,
            gzip.GzipFile(name, mode, compresslevel, fileobj),
            **kwargs)
    except IOError:
        if not extfileobj:
            fileobj.close()
        raise ReadError("not a gzip file")
    except:
        # Any other failure (e.g. a ReadError because the decompressed
        # data is not a tar archive) must not leak the file opened
        # above. The exception itself is re-raised unchanged.
        if not extfileobj:
            fileobj.close()
        raise
    t._extfileobj = extfileobj
    return t
1790
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError for a mode other than 'r' or 'w',
       CompressionError if the bz2 module is unavailable and ReadError
       if an IOError or EOFError occurs while opening the archive.
    """
    # Exact comparison: a substring test against "rw" would also
    # accept the empty string.
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    if fileobj is not None:
        fileobj = _BZ2Proxy(fileobj, mode)
    else:
        fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (IOError, EOFError):
        fileobj.close()
        raise ReadError("not a bzip2 file")
    except:
        # Any other failure (e.g. a ReadError because the decompressed
        # data is not a tar archive) must not leak the wrapper created
        # above. The exception itself is re-raised unchanged.
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1816
# All *open() methods are registered here.
# Maps a compression type, as written after ':' in an open() mode string,
# to the name of the classmethod that open() looks up with getattr().
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz":  "gzopen",    # gzip compressed tar
    "bz2": "bz2open"    # bzip2 compressed tar
}
1823
1824 #--------------------------------------------------------------------------
1825 # The public methods which TarFile provides:
1826
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive and the archive is padded with zeros up
       to a multiple of RECORDSIZE. Calling close() on an already closed
       TarFile does nothing. A file object supplied by the caller is
       left open.
    """
    if self.closed:
        return

    # Mark the archive closed first and release the file in a finally
    # clause, so that a failing write cannot leave the underlying file
    # open or allow the trailer to be written a second time.
    self.closed = True
    try:
        if self.mode in ("a", "w"):
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
    finally:
        if not self._extfileobj:
            self.fileobj.close()
1846
def getmember(self, name):
    """Look up the member called `name' via _getmember() and return its
       TarInfo object. KeyError is raised if no such member is found.
       If a member occurs more than once in the archive, its last
       occurrence is assumed to be the most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001857
def getmembers(self):
    """Return the list of TarInfo objects for all archive members, in
       the order in which they appear in the archive.
    """
    self._check()
    # A complete list is only available once the whole archive has
    # been scanned.
    fully_scanned = self._loaded
    if not fully_scanned:
        self._load()
    return self.members
1867
def getnames(self):
    """Return the names of all archive members as a list, in the same
       order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001873
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Build a TarInfo object describing either the file `name' or the
       open file object `fileobj' (examined with os.fstat on its file
       descriptor). The result may be modified before it is handed to
       addfile(). `arcname', if given, is the name the member gets in
       the archive. None is returned for file types that cannot be
       represented.
    """
    self._check("aw")

    # An open file object takes precedence: its own name replaces `name'.
    if fileobj is not None:
        name = fileobj.name

    # Derive the member name: drop any drive prefix, use forward
    # slashes only and make absolute paths relative.
    if arcname is None:
        arcname = name
    arcname = os.path.splitdrive(arcname)[1]
    arcname = arcname.replace(os.sep, "/").lstrip("/")

    tarinfo = self.tarinfo()
    tarinfo.tarfile = self

    # Stat the open file if there is one; otherwise use lstat where it
    # exists and symlinks are not to be followed, else stat.
    if fileobj is not None:
        statres = os.fstat(fileobj.fileno())
    elif hasattr(os, "lstat") and not self.dereference:
        statres = os.lstat(name)
    else:
        statres = os.stat(name)
    linkname = ""

    stmd = statres.st_mode
    if stat.S_ISREG(stmd):
        inode = (statres.st_ino, statres.st_dev)
        already_archived = (not self.dereference
                            and statres.st_nlink > 1
                            and inode in self.inodes
                            and arcname != self.inodes[inode])
        if already_archived:
            # Hardlink to a file that was added under another name.
            member_type = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # Remember the inode only if it is valid;
            # for win32 it is always 0.
            member_type = REGTYPE
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(stmd):
        member_type = DIRTYPE
    elif stat.S_ISFIFO(stmd):
        member_type = FIFOTYPE
    elif stat.S_ISLNK(stmd):
        member_type = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(stmd):
        member_type = CHRTYPE
    elif stat.S_ISBLK(stmd):
        member_type = BLKTYPE
    else:
        return None

    # Transfer everything the stat result provides.
    tarinfo.name = arcname
    tarinfo.mode = stmd
    tarinfo.uid = statres.st_uid
    tarinfo.gid = statres.st_gid
    # Only regular files carry data.
    tarinfo.size = statres.st_size if member_type == REGTYPE else 0
    tarinfo.mtime = statres.st_mtime
    tarinfo.type = member_type
    tarinfo.linkname = linkname

    # Symbolic owner and group names are optional: unknown ids are
    # silently left without a name.
    if pwd:
        try:
            tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
        except KeyError:
            pass

    if member_type in (CHRTYPE, BLKTYPE):
        if hasattr(os, "major") and hasattr(os, "minor"):
            tarinfo.devmajor = os.major(statres.st_rdev)
            tarinfo.devminor = os.minor(statres.st_rdev)
    return tarinfo
1971
def list(self, verbose=True):
    """Write a table of contents to sys.stdout, one line per member.
       With `verbose' False only the member names are printed; with
       `verbose' True the output resembles that of `ls -l'.
    """
    self._check()

    for member in self:
        if verbose:
            print(filemode(member.mode), end=' ')
            owner = member.uname or member.uid
            group = member.gname or member.gid
            print("%s/%s" % (owner, group), end=' ')
            if member.ischr() or member.isblk():
                # Device nodes show "major,minor" in place of a size.
                device = "%d,%d" % (member.devmajor, member.devminor)
                print("%10s" % device, end=' ')
            else:
                print("%10d" % member.size, end=' ')
            stamp = time.localtime(member.mtime)[:6]
            print("%d-%02d-%02d %02d:%02d:%02d" % stamp, end=' ')

        suffix = "/" if member.isdir() else ""
        print(member.name + suffix, end=' ')

        if verbose:
            if member.issym():
                print("->", member.linkname, end=' ')
            if member.islnk():
                print("link to", member.linkname, end=' ')
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002000
def add(self, name, arcname=None, recursive=True, exclude=None, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. `exclude' is a function that should
       return True for each filename to be excluded; passing it emits a
       DeprecationWarning. `filter' is a function that expects a TarInfo
       object argument and returns the changed TarInfo object, if it
       returns None the TarInfo object will be excluded from the archive.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)

    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        # The with statement closes the source file even if addfile()
        # raises, so the handle is not leaked.
        with bltn_open(name, "rb") as f:
            self.addfile(tarinfo, f)

    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            for f in os.listdir(name):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                        recursive, exclude, filter)

    else:
        self.addfile(tarinfo)
2062
def addfile(self, tarinfo, fileobj=None):
    """Append a copy of the TarInfo object `tarinfo' to the archive. If
       `fileobj' is given, tarinfo.size bytes are read from it and
       written after the header, padded with NUL bytes to a full block.
       TarInfo objects can be created with gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.
    """
    self._check("aw")

    # Work on a copy; the caller's object is never stored.
    tarinfo = copy.copy(tarinfo)

    header = tarinfo.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset += len(header)

    # Member data, if any, follows the header.
    if fileobj is not None:
        copyfileobj(fileobj, self.fileobj, tarinfo.size)
        full_blocks, tail = divmod(tarinfo.size, BLOCKSIZE)
        if tail > 0:
            self.fileobj.write(NUL * (BLOCKSIZE - tail))
            full_blocks += 1
        self.offset += full_blocks * BLOCKSIZE

    self.members.append(tarinfo)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002088
def extractall(self, path=".", members=None):
    """Extract every member of the archive below `path' (the current
       working directory by default). Directories are first created
       with mode 0o700; their owner, modification time and permissions
       are set once everything has been extracted. `members' is
       optional and must be a subset of the list returned by
       getmembers().
    """
    if members is None:
        members = self

    deferred_dirs = []
    for member in members:
        if member.isdir():
            # Remember the original and extract a copy with a safe mode.
            deferred_dirs.append(member)
            member = copy.copy(member)
            member.mode = 0o700
        self.extract(member, path)

    # Walk the directories in reverse name order.
    by_name = sorted(deferred_dirs, key=lambda entry: entry.name)
    for member in reversed(by_name):
        target = os.path.join(path, member.name)
        try:
            self.chown(member, target)
            self.utime(member, target)
            self.chmod(member, target)
        except ExtractError as exc:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % exc)
2125
def extract(self, member, path=""):
    """Extract one member below `path' (the current working directory
       by default), using its full name. `member' may be a filename or
       a TarInfo object. Whether errors are raised or only reported
       through _dbg() depends on self.errorlevel.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Hard links need their target path prepared for makelink().
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    try:
        self._extract_member(tarinfo, os.path.join(path, tarinfo.name))
    except EnvironmentError as exc:
        if self.errorlevel > 0:
            raise
        if exc.filename is None:
            self._dbg(1, "tarfile: %s" % exc.strerror)
        else:
            self._dbg(1, "tarfile: %s %r" % (exc.strerror, exc.filename))
    except ExtractError as exc:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % exc)
2158
def extractfile(self, member):
    """Return a read-only file-like object for an archive member.
       `member' may be a filename or a TarInfo object. Regular files and
       members of an unknown type yield a file object for their own
       data, links yield the file object of their target, and every
       other kind of member yields None.
       The file-like object provides the following methods:
       read(), readline(), readlines(), seek() and tell()
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # A member whose type is unknown is treated like a regular file.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    if not (tarinfo.islnk() or tarinfo.issym()):
        # No data is associated with the member (directory, chrdev,
        # blkdev, etc.), so there is nothing to hand out.
        return None

    if isinstance(self.fileobj, _Stream):
        # A small but ugly workaround for the case that someone tries
        # to extract a (sym)link as a file-object from a non-seekable
        # stream of tar blocks.
        raise StreamError("cannot extract (sym)link as file object")

    # A (sym)link's file object is its target's file object.
    return self.extractfile(self._find_link_target(tarinfo))
2196
def _extract_member(self, tarinfo, targetpath):
    """Write the member described by tarinfo to the physical file
       targetpath, then apply owner, mode and modification time.
    """
    # Strip trailing slashes and switch from forward slashes to the
    # platform specific separator.
    targetpath = targetpath.rstrip("/").replace("/", os.sep)

    # Directories that are not part of the archive are created with
    # default permissions.
    parent = os.path.dirname(targetpath)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    if tarinfo.islnk() or tarinfo.issym():
        label = "%s -> %s" % (tarinfo.name, tarinfo.linkname)
    else:
        label = tarinfo.name
    self._dbg(1, label)

    # Pick the make*() method that matches the member type.
    if tarinfo.isreg():
        make = self.makefile
    elif tarinfo.isdir():
        make = self.makedir
    elif tarinfo.isfifo():
        make = self.makefifo
    elif tarinfo.ischr() or tarinfo.isblk():
        make = self.makedev
    elif tarinfo.islnk() or tarinfo.issym():
        make = self.makelink
    elif tarinfo.type not in SUPPORTED_TYPES:
        make = self.makeunknown
    else:
        make = self.makefile
    make(tarinfo, targetpath)

    self.chown(tarinfo, targetpath)
    if tarinfo.issym():
        # Mode and mtime are not applied to symbolic links.
        return
    self.chmod(tarinfo, targetpath)
    self.utime(tarinfo, targetpath)
2238
2239 #--------------------------------------------------------------------------
2240 # Below are the different file methods. They are called via
2241 # _extract_member() when extract() is called. They can be replaced in a
2242 # subclass to implement other functionality.
2243
def makedir(self, tarinfo, targetpath):
    """Create the directory targetpath; an already existing
       directory is not an error.
    """
    try:
        # Start out with a safe mode, the real mode is applied
        # later in _extract_member().
        os.mkdir(targetpath, 0o700)
    except EnvironmentError as exc:
        already_exists = exc.errno == errno.EEXIST
        if not already_exists:
            raise
2254
def makefile(self, tarinfo, targetpath):
    """Make a file called targetpath from the data of tarinfo.

       The member's data is obtained through extractfile() and copied
       into a newly created (or truncated) binary file. Both file
       objects are closed even if opening the target or copying the
       data fails, so no file handles leak on error.
    """
    source = self.extractfile(tarinfo)
    try:
        # The with-block closes the target on success and on failure.
        with bltn_open(targetpath, "wb") as target:
            copyfileobj(source, target)
    finally:
        source.close()
2263
def makeunknown(self, tarinfo, targetpath):
    """Extract a member of unknown type to targetpath as if it were
       a regular file and report that through _dbg().
    """
    self.makefile(tarinfo, targetpath)
    notice = ("tarfile: Unknown file type %r, "
              "extracted as regular file." % tarinfo.type)
    self._dbg(1, notice)
2271
def makefifo(self, tarinfo, targetpath):
    """Create a fifo at targetpath. Raises ExtractError when the
       platform offers no os.mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002279
def makedev(self, tarinfo, targetpath):
    """Create the character or block device node targetpath. Raises
       ExtractError when os.mknod() or os.makedev() is unavailable.
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the permission bits with the matching device type flag.
    kind = stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR
    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, tarinfo.mode | kind, device)
2294
def makelink(self, tarinfo, targetpath):
    """Make a (symbolic) link called targetpath. If it cannot be created
       (platform limitation), we try to make a copy of the referenced file
       instead of a link.

       Raises ExtractError if the fallback copy is needed but the link
       target cannot be found inside the archive.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        elif os.path.exists(tarinfo._link_target):
            # _link_target is prepared by extract().
            os.link(tarinfo._link_target, targetpath)
        else:
            # The hard link's target is not on disk, so extract the
            # archived target under the link's name instead.
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
    except symlink_exception:
        # Links could not be created: fall back to a copy of the
        # referenced member. This must only happen on failure; running
        # it after a successful link would write through the new link.
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002323
def chown(self, tarinfo, targetpath):
    """Give targetpath the owner and group recorded in tarinfo. Does
       nothing unless the pwd module is usable and the effective user
       id is 0. Raises ExtractError if the ownership change fails.
    """
    # We have to be root to do so.
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        return

    # Resolve the group: by name, then by numeric id, then fall back
    # to the group of the current process.
    try:
        group = grp.getgrnam(tarinfo.gname)[2]
    except KeyError:
        try:
            group = grp.getgrgid(tarinfo.gid)[2]
        except KeyError:
            group = os.getgid()

    # Resolve the user in the same three steps.
    try:
        user = pwd.getpwnam(tarinfo.uname)[2]
    except KeyError:
        try:
            user = pwd.getpwuid(tarinfo.uid)[2]
        except KeyError:
            user = os.getuid()

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, user, group)
        elif sys.platform != "os2emx":
            os.chown(targetpath, user, group)
    except EnvironmentError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002351
def chmod(self, tarinfo, targetpath):
    """Apply the permission bits stored in tarinfo to targetpath.
       Does nothing when os.chmod() is unavailable; raises
       ExtractError if the mode cannot be changed.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except EnvironmentError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002360
def utime(self, tarinfo, targetpath):
    """Set access and modification time of targetpath to the mtime
       stored in tarinfo. Does nothing when os.utime() is unavailable;
       raises ExtractError if the times cannot be changed.
    """
    if hasattr(os, 'utime'):
        stamp = (tarinfo.mtime, tarinfo.mtime)
        try:
            os.utime(targetpath, stamp)
        except EnvironmentError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002370
2371 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, when
       TarFile is opened for reading. Return None if there is no more
       available.

       Raises ReadError for an invalid (unless ignore_zeros is set),
       empty or truncated header at offset 0, and for every
       SubsequentHeaderError.
    """
    self._check("ra")
    # A member stored in self.firstmember is handed out exactly once
    # before any further header is read.
    if self.firstmember is not None:
        m = self.firstmember
        self.firstmember = None
        return m

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    # The loop is only re-entered through the `continue' statements
    # below; every other path falls through to the `break'.
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as e:
            # With ignore_zeros set, skip one block and try again;
            # otherwise fall through with tarinfo still None.
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as e:
            # Same skipping as above; without ignore_zeros an invalid
            # header is only fatal at the very start of the archive.
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
            elif self.offset == 0:
                raise ReadError(str(e))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as e:
            if self.offset == 0:
                raise ReadError(str(e))
        except SubsequentHeaderError as e:
            raise ReadError(str(e))
        break

    # Either record the new member or, when none was read, mark the
    # member list as complete.
    if tarinfo is not None:
        self.members.append(tarinfo)
    else:
        self._loaded = True

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002417
2418 #--------------------------------------------------------------------------
2419 # Little helper methods:
2420
def _getmember(self, name, tarinfo=None, normalize=False):
    """Search the archive members from bottom to top for one called
       `name' and return it, or None if there is no match.
       If tarinfo is given, only the members before it are searched.
       With normalize set, both sides of the comparison are passed
       through os.path.normpath() first.
    """
    # getmembers() ensures that all members have been loaded.
    candidates = self.getmembers()

    # Limit the member search list up to tarinfo.
    if tarinfo is not None:
        stop = candidates.index(tarinfo)
        candidates = candidates[:stop]

    if normalize:
        name = os.path.normpath(name)

    for candidate in reversed(candidates):
        candidate_name = candidate.name
        if normalize:
            candidate_name = os.path.normpath(candidate_name)
        if candidate_name == name:
            return candidate
    return None
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002443
def _load(self):
    """Call next() until it returns None, so that every readable
       member has been visited, then flag the archive as loaded.
    """
    while self.next() is not None:
        pass
    self._loaded = True
2453
def _check(self, mode=None):
    """Raise IOError if the TarFile is closed, or if `mode' is given
       and does not contain the TarFile's own mode.
    """
    if self.closed:
        raise IOError("%s is closed" % self.__class__.__name__)
    if mode is None or self.mode in mode:
        return
    raise IOError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002462
def _find_link_target(self, tarinfo):
    """Find the target member of a symlink or hardlink member in the
       archive.

       Returns the target member found by _getmember(); raises
       KeyError if there is none.
    """
    if tarinfo.issym():
        # Always search the entire archive. The link name is relative
        # to the directory of the link itself; empty components are
        # dropped so that a link in the archive root resolves to
        # "target" rather than to the absolute-looking "/target".
        parts = (os.path.dirname(tarinfo.name), tarinfo.linkname)
        linkname = "/".join(filter(None, parts))
        limit = None
    else:
        # Search the archive before the link, because a hard link is
        # just a reference to an already archived file.
        linkname = tarinfo.linkname
        limit = tarinfo

    member = self._getmember(linkname, tarinfo=limit, normalize=True)
    if member is None:
        raise KeyError("linkname %r not found" % linkname)
    return member
2481
def __iter__(self):
    """Return an iterator over the archive members: a plain list
       iterator once all members are loaded, a TarIter otherwise.
    """
    if not self._loaded:
        return TarIter(self)
    return iter(self.members)
2489
def _dbg(self, level, msg):
    """Print msg to sys.stderr unless `level' exceeds self.debug.
    """
    if level > self.debug:
        return
    print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002495
def __enter__(self):
    """Enter the runtime context: verify through _check() that the
       TarFile is usable and return the TarFile itself.
    """
    self._check()
    return self
2499
def __exit__(self, type, value, traceback):
    """Leave the runtime context. Without a pending exception the
       archive is closed normally; otherwise only the underlying file
       object is closed (unless it is externally owned) and the
       TarFile is marked as closed.
    """
    if type is not None:
        # An exception occurred. We must not call close() because
        # it would try to write end-of-archive blocks and padding.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        return
    self.close()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002509# class TarFile
2510
class TarIter:
    """Iterator over the members of a TarFile.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Bind the iterator to `tarfile' and start at the first member.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """An iterator is its own iterator object.
        """
        return self

    def __next__(self):
        """Return the next member, reading it with TarFile's next()
           method as long as the archive is not fully loaded. When all
           members have been read, set TarFile as _loaded.
        """
        archive = self.tarfile
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely. Once the
        # archive is loaded, members are taken from its list instead.
        if archive._loaded:
            try:
                item = archive.members[self.index]
            except IndexError:
                raise StopIteration
        else:
            item = archive.next()
            if not item:
                archive._loaded = True
                raise StopIteration
        self.index += 1
        return item
2546
2547# Helper classes for sparse file support
class _section:
    """Base class for _data and _hole: a range that starts at
       `offset' and spans `size' positions.
    """
    def __init__(self, offset, size):
        self.offset = offset
        self.size = size

    def __contains__(self, offset):
        # The range is half-open: the end position itself is excluded.
        start = self.offset
        return start <= offset < start + self.size
2556
class _data(_section):
    """A data section of a sparse file; `realpos' is stored on the
       instance in addition to the section's offset and size.
    """
    def __init__(self, offset, size, realpos):
        super().__init__(offset, size)
        self.realpos = realpos
2563
class _hole(_section):
    """A hole section of a sparse file. It adds nothing to _section
       and exists so that holes can be told apart from data by type.
    """
2568
class _ringbuffer(list):
    """List of sections that remembers where the previous search
       succeeded and starts the next search from there, which is
       faster than scanning a regular list from the front each time.
    """
    def __init__(self):
        # Index of the item found by the most recent successful find().
        self.idx = 0

    def find(self, offset):
        """Return the first item containing `offset', searching from
           the last hit and wrapping around once. Return None if no
           item matches, which includes the case of an empty buffer.
        """
        count = len(self)
        for step in range(count):
            idx = (self.idx + step) % count
            item = self[idx]
            if offset in item:
                self.idx = idx
                return item
        # End of File
        return None
2589
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002590#--------------------
2591# exported functions
2592#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False. The check consists of
       opening the archive and closing it again.
    """
    try:
        archive = open(name)
        archive.close()
    except TarError:
        return False
    return True
2603
# Keep the previous binding of open() reachable as bltn_open (used by
# makefile() to create the target file) before the name is rebound.
bltn_open = open
# From here on, the module-level name open refers to TarFile.open.
open = TarFile.open