blob: fd898c74a1b9722f1c44ba967f436c21ce1cf5b5 [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
32__version__ = "$Revision$"
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000033
Guido van Rossumd8faa362007-04-27 19:54:29 +000034version = "0.9.0"
Guido van Rossum98297ee2007-11-06 21:34:58 +000035__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000036__date__ = "$Date$"
37__cvsid__ = "$Id$"
Guido van Rossum98297ee2007-11-06 21:34:58 +000038__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000039
40#---------
41# Imports
42#---------
43import sys
44import os
45import shutil
46import stat
47import errno
48import time
49import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000050import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000051import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000052
53try:
54 import grp, pwd
55except ImportError:
56 grp = pwd = None
57
# Tuple of exception types treated as "symlinks cannot be created here".
# os.symlink on Windows prior to 6.0 raises NotImplementedError
# (AttributeError presumably covers platforms where os.symlink is missing
# altogether -- TODO confirm against the callers of this tuple).
symlink_exception = (AttributeError, NotImplementedError)
try:
    # WindowsError (1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (WindowsError,)
except NameError:
    # The name WindowsError is not defined in this interpreter; keep the
    # two-element tuple unchanged.
    pass
66
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000067# from tarfile import *
68__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
69
Georg Brandl1a3284e2007-12-02 09:40:06 +000070from builtins import open as _open # Since 'open' is TarFile.open
Guido van Rossum8f78fe92006-08-24 04:03:53 +000071
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000072#---------------------------------------------------------
73# tar constants
74#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records (20 blocks = 10240 bytes)
# NOTE(review): POSIX_MAGIC below is 8 bytes long, but this literal is only
# 7 bytes as shown here. A space may have been lost when this listing was
# extracted -- verify against the upstream file before relying on it.
GNU_MAGIC = b"ustar \0"         # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

# Member type markers; each one is a single byte.
REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

# Identifiers for the archive formats; DEFAULT_FORMAT is also the default
# value of the ``format`` argument of itn().
USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000107
108#---------------------------------------------------------
109# tarfile constants
110#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Each value is the callable that converts the
# field to its numeric type.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}
143
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000144#---------------------------------------------------------
145# Bits used in the mode field, values in octal.
146#---------------------------------------------------------
# File type bits; filemode() maps them to the first character of its output.
S_IFLNK = 0o120000        # symbolic link
S_IFREG = 0o100000        # regular file
S_IFBLK = 0o060000        # block device
S_IFDIR = 0o040000        # directory
S_IFCHR = 0o020000        # character device
S_IFIFO = 0o010000        # fifo

TSUID   = 0o4000          # set UID on execution
TSGID   = 0o2000          # set GID on execution
TSVTX   = 0o1000          # reserved (shown as "t"/"T" by filemode())

# Permission bits for owner, group and other.
TUREAD  = 0o400           # read by owner
TUWRITE = 0o200           # write by owner
TUEXEC  = 0o100           # execute/search by owner
TGREAD  = 0o040           # read by group
TGWRITE = 0o020           # write by group
TGEXEC  = 0o010           # execute/search by group
TOREAD  = 0o004           # read by other
TOWRITE = 0o002           # write by other
TOEXEC  = 0o001           # execute/search by other
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000167
168#---------------------------------------------------------
Guido van Rossumd8faa362007-04-27 19:54:29 +0000169# initialization
170#---------------------------------------------------------
# Module-wide default text encoding: UTF-8 when os.name is "nt" or "ce",
# otherwise whatever sys.getfilesystemencoding() reports.
if os.name in ("nt", "ce"):
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000175
176#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000177# Some useful functions
178#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000179
def stn(s, length, encoding, errors):
    """Encode the string s and fit it into a field of exactly length bytes.

       Shorter results are padded on the right with NUL bytes, longer
       results are cut off after length bytes.
    """
    encoded = s.encode(encoding, errors)
    # ljust() only pads; the slice takes care of over-long values.
    return encoded.ljust(length, NUL)[:length]
Thomas Wouters477c8d52006-05-27 19:21:47 +0000185
def nts(s, encoding, errors):
    """Decode the part of the bytes object s that precedes its first
       null byte (or all of s if it contains none) to a string.
    """
    head = s.partition(b"\0")[0]
    return head.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000193
def nti(s):
    """Convert a number field to a python number.

       s is the raw bytes of the field.  A field whose first byte is
       0o200 holds a big-endian base-256 number in its remaining bytes
       (the GNU encoding written by itn()); every other field is parsed
       as a null-terminated string of octal digits, an empty string
       counting as 0.

       Raises InvalidHeaderError if an octal field cannot be parsed.
    """
    # There are two possible encodings for a number field, see
    # itn() below.
    # Indexing a bytes object yields an int, so the marker has to be
    # compared with the integer 0o200 and not with chr(0o200).
    if s[0] != 0o200:
        try:
            n = int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    else:
        n = 0
        for i in range(len(s) - 1):
            n <<= 8
            # s[i + 1] already is an int; ord() would raise TypeError.
            n += s[i + 1]
    return n
210
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field of digits bytes.

       Raises ValueError if n does not fit into the field in the
       requested format.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 byte indicates this particular
    # encoding, the following digits-1 bytes are a big-endian
    # representation. This allows values up to (256**(digits-1))-1.
    width = digits - 1

    if 0 <= n < 8 ** width:
        return bytes("%0*o" % (width, n), "ascii") + NUL

    if format != GNU_FORMAT or n >= 256 ** width:
        raise ValueError("overflow in number field")

    if n < 0:
        # XXX We mimic GNU tar's behaviour with negative numbers,
        # this could raise OverflowError.
        (n,) = struct.unpack("L", struct.pack("l", n))

    # Emit the low byte first and keep pushing to the front, so the most
    # significant byte ends up right behind the 0o200 marker.
    field = bytearray()
    for _ in range(width):
        field.insert(0, n & 0o377)
        n >>= 8
    field.insert(0, 0o200)
    return field
237
def calc_chksums(buf):
    """Return the pair (unsigned_chksum, signed_chksum) for the header
       block in buf.

       All of the first 512 bytes are summed up except for the eight
       bytes of the chksum field (offsets 148-155), which count as if
       they were filled with spaces (8 * 0x20 = 256).  According to the
       GNU tar sources, some tars (Sun and NeXT) calculate chksum with
       signed char, which gives a different result if there are bytes
       with the high bit set, hence the two variants.
    """
    # "8x" skips the chksum field itself.
    unsigned_bytes = struct.unpack_from("148B8x356B", buf)
    signed_bytes = struct.unpack_from("148b8x356b", buf)
    return 256 + sum(unsigned_bytes), 256 + sum(signed_bytes)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000250
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.

       Raises IOError if src runs out of data before length bytes
       have been copied.
    """
    if length is None:
        shutil.copyfileobj(src, dst)
        return
    if length == 0:
        return

    BUFSIZE = 16 * 1024

    def transfer(count):
        # Move exactly count bytes; a short read means src is exhausted.
        chunk = src.read(count)
        if len(chunk) < count:
            raise IOError("end of file reached")
        dst.write(chunk)

    full_chunks, tail = divmod(length, BUFSIZE)
    for _ in range(full_chunks):
        transfer(BUFSIZE)
    if tail != 0:
        transfer(tail)
    return
275
# Lookup table used by filemode().  Each inner tuple describes one
# character of the resulting string; its (bits, char) pairs are tried in
# order and the first pair whose bits are all set in the mode wins, so
# the combined entries (e.g. TUEXEC|TSUID) have to come before the
# single-bit ones.
filemode_table = (
    # file type
    ((S_IFLNK,      "l"),
     (S_IFREG,      "-"),
     (S_IFBLK,      "b"),
     (S_IFDIR,      "d"),
     (S_IFCHR,      "c"),
     (S_IFIFO,      "p")),

    # owner
    ((TUREAD,       "r"),),
    ((TUWRITE,      "w"),),
    ((TUEXEC|TSUID, "s"),
     (TSUID,        "S"),
     (TUEXEC,       "x")),

    # group
    ((TGREAD,       "r"),),
    ((TGWRITE,      "w"),),
    ((TGEXEC|TSGID, "s"),
     (TSGID,        "S"),
     (TGEXEC,       "x")),

    # other
    ((TOREAD,       "r"),),
    ((TOWRITE,      "w"),),
    ((TOEXEC|TSVTX, "t"),
     (TSVTX,        "T"),
     (TOEXEC,       "x"))
)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000302
def filemode(mode):
    """Convert a file's mode to a string of the form
       -rwxrwxrwx.
       Used by TarFile.list()
    """
    chars = []
    for candidates in filemode_table:
        # The first entry whose bits are all present in mode supplies the
        # character; "-" stands for "none of them".
        chars.append(next((char for bit, char in candidates
                           if mode & bit == bit), "-"))
    return "".join(chars)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000317
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000351
352#---------------------------
353# internal stream interface
354#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.
    """

    def __init__(self, name, mode):
        """Open the file name with os.open().

           mode must be "r" (read only) or "w" (write only, create and
           truncate); any other value raises KeyError.
        """
        mode = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        if hasattr(os, "O_BINARY"):
            mode |= os.O_BINARY
        self.fd = os.open(name, mode, 0o666)

    def close(self):
        """Close the underlying file descriptor."""
        os.close(self.fd)

    def read(self, size):
        """Return at most size bytes read from the file descriptor."""
        return os.read(self.fd, size)

    def write(self, s):
        """Write all of the bytes-like object s to the file descriptor."""
        # os.write() returns the number of bytes it actually wrote, which
        # may be less than requested, so keep going until nothing is left.
        # The memoryview avoids copying the remainder on every round.
        data = memoryview(s)
        written = 0
        while written < len(data):
            written += os.write(self.fd, data[written:])
378
class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip or bzip2 compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.

           name     -- file name; opened with _LowLevelFile if fileobj
                       is None, and stored in the gzip header on writing
           mode     -- "r" or "w"
           comptype -- "tar", "gz", "bz2" or "*" (detect on reading)
           fileobj  -- object to read from / write to, or None
           bufsize  -- size of the blocks exchanged with fileobj

           Raises CompressionError if the module needed for comptype
           cannot be imported.
        """
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name     = name or ""
        self.mode     = mode
        self.comptype = comptype
        self.fileobj  = fileobj
        self.bufsize  = bufsize
        self.buf      = b""
        self.pos      = 0
        self.closed   = False
        # Exception type that signals corrupt compressed data; _read()
        # turns it into ReadError.
        self.exception = IOError

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                # zlib reports bad data with zlib.error, not IOError.
                self.exception = zlib.error
                if mode == "r":
                    self._init_read_gz()
                else:
                    self._init_write_gz()

            if comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                else:
                    self.cmp = bz2.BZ2Compressor()
        except BaseException:
            # Do not leak a file we opened ourselves, then let the
            # original exception propagate.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # "closed" is missing if __init__() failed early.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                            -self.zlib.MAX_WBITS,
                                            self.zlib.DEF_MEM_LEVEL,
                                            0)
        timestamp = struct.pack("<L", int(time.time()))
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write the bytes s to the stream.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Append s to the output buffer and hand every complete
           block of bufsize bytes over to the file object.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        # Mark the stream closed first so that a failure below is not
        # retried (and data not flushed twice) by __del__().
        self.closed = True
        try:
            if self.mode == "w" and self.comptype != "tar":
                self.buf += self.cmp.flush()

            if self.mode == "w" and self.buf:
                self.fileobj.write(self.buf)
                self.buf = b""
                if self.comptype == "gz":
                    # The native zlib crc is an unsigned 32-bit integer, but
                    # the Python wrapper implicitly casts that to a signed C
                    # long.  So, on a 32-bit box self.crc may "look negative",
                    # while the same crc on a 64-bit box may "look positive".
                    # To avoid irksome warnings from the `struct` module, force
                    # it to look positive on all boxes.
                    self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
        finally:
            # Release a file we opened ourselves even if flushing failed.
            if not self._extfileobj:
                self.fileobj.close()

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

           Raises ReadError if the gzip magic is missing and
           CompressionError if the method is not deflate.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)

        if flag & 4:
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            # The extra field is part of the raw header, so it has to be
            # skipped with __read(); read() would push it through the
            # decompressor and advance self.pos.
            self.__read(xlen)
        if flag & 8:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.

           Raises StreamError if pos lies before the current position.
        """
        if pos - self.pos >= 0:
            # Seeking forward means reading and discarding the data.
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            # The chunks are bytes, so the separator must be bytes too.
            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream.

           Raises ReadError if the compressed data is invalid.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception:
                raise ReadError("invalid compressed data")
            self.dbuf += buf
            c += len(buf)
        buf = self.dbuf[:size]
        self.dbuf = self.dbuf[size:]
        return buf

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            self.buf += buf
            c += len(buf)
        buf = self.buf[:size]
        self.buf = self.buf[size:]
        return buf
# class _Stream
610
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        """Wrap fileobj and read ahead its first BLOCKSIZE bytes, which
           getcomptype() inspects.
        """
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        """Return the block that was read ahead, regardless of size.

           The method replaces itself with fileobj.read, so every
           later call goes straight to the wrapped object.
        """
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Return "gz", "bz2" or "tar" depending on the magic bytes at
           the start of the stream.
        """
        if self.buf.startswith(b"\037\213\010"):
            return "gz"
        # A bzip2 stream starts with "BZh", one byte for the block size
        # (compression level) and the block magic "1AY&SY".  The level
        # byte is skipped so that every level is recognized, not only 9.
        if self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            return "bz2"
        return "tar"

    def close(self):
        """Close the wrapped file object."""
        self.fileobj.close()
# class StreamProxy
634
class _BZ2Proxy(object):
    """Small proxy class that enables external file object
       support for "r:bz2" and "w:bz2" modes. This is actually
       a workaround for a limitation in bz2 module's BZ2File
       class which (unlike gzip.GzipFile) has no support for
       a file object argument.
    """

    # Number of compressed bytes fetched from the file object per read.
    blocksize = 16 * 1024

    def __init__(self, fileobj, mode):
        """Wrap fileobj for reading (mode "r") or writing (any other
           mode) bzip2 data.
        """
        self.fileobj = fileobj
        self.mode = mode
        self.name = getattr(fileobj, "name", None)
        self.init()

    def init(self):
        """(Re)start at position 0 with a fresh bz2 (de)compressor."""
        import bz2
        self.pos = 0
        if self.mode != "r":
            self.bz2obj = bz2.BZ2Compressor()
            return
        self.bz2obj = bz2.BZ2Decompressor()
        self.fileobj.seek(0)
        self.buf = b""

    def read(self, size):
        """Return up to size bytes of decompressed data."""
        available = len(self.buf)
        while available < size:
            raw = self.fileobj.read(self.blocksize)
            if not raw:
                break
            data = self.bz2obj.decompress(raw)
            self.buf += data
            available += len(data)

        result, self.buf = self.buf[:size], self.buf[size:]
        self.pos += len(result)
        return result

    def seek(self, pos):
        """Move to the decompressed position pos."""
        # Going backwards means starting over from the beginning; going
        # forwards means reading and discarding the data in between.
        if pos < self.pos:
            self.init()
        self.read(pos - self.pos)

    def tell(self):
        """Return the current position in the decompressed data."""
        return self.pos

    def write(self, data):
        """Compress data and pass the result on to the file object."""
        self.pos += len(data)
        self.fileobj.write(self.bz2obj.compress(data))

    def close(self):
        """Flush the compressor in write mode.  The wrapped file object
           is not closed.
        """
        if self.mode == "w":
            self.fileobj.write(self.bz2obj.flush())
# class _BZ2Proxy
694
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000695#------------------------
696# Extraction file object
697#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        """Expose size bytes of fileobj as a file of their own.

           offset is the position in fileobj where the stored data
           starts.  blockinfo is an optional list of (offset, size)
           pairs naming the regions of the exposed file that are
           actually stored (back to back, starting at offset); all
           other regions read as null bytes.  Without blockinfo the
           whole file is one stored region.
        """
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.  Every entry is
        # (is_data, start, stop, position_in_fileobj).
        self.map_index = 0
        self.map = []
        lastpos = 0
        realpos = self.offset
        for block_offset, block_size in blockinfo:
            if block_offset > lastpos:
                self.map.append((False, lastpos, block_offset, None))
            self.map.append((True, block_offset, block_offset + block_size,
                             realpos))
            realpos += block_size
            lastpos = block_offset + block_size
        if lastpos < self.size:
            self.map.append((False, lastpos, self.size, None))

    def seekable(self):
        """Return whether the wrapped file object is seekable."""
        if not hasattr(self.fileobj, "seekable"):
            # XXX gzip.GzipFile and bz2.BZ2File
            return True
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position):
        """Seek to a position in the file.
        """
        self.position = position

    def read(self, size=None):
        """Read data from the file.

           Return at most size bytes, or everything up to the end of
           the file if size is None.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        buf = b""
        while size > 0:
            # Find the map entry containing the current position,
            # starting at the one used last and wrapping around.
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                else:
                    self.map_index += 1
                    if self.map_index == len(self.map):
                        self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                # Fetch exactly the bytes asked for.  Reading the whole
                # block and slicing it with file-relative indices only
                # gave the right answer for blocks starting at 0.
                self.fileobj.seek(offset + (self.position - start))
                buf += self.fileobj.read(length)
            else:
                buf += NUL * length
            size -= length
            self.position += length
        return buf
#class _FileInFile
772
773
774class ExFileObject(object):
775 """File-like object for reading an archive member.
776 Is returned by TarFile.extractfile().
777 """
778 blocksize = 1024
779
780 def __init__(self, tarfile, tarinfo):
781 self.fileobj = _FileInFile(tarfile.fileobj,
782 tarinfo.offset_data,
783 tarinfo.size,
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000784 tarinfo.sparse)
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000785 self.name = tarinfo.name
786 self.mode = "r"
787 self.closed = False
788 self.size = tarinfo.size
789
790 self.position = 0
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000791 self.buffer = b""
792
    def readable(self):
        """Return True: the file object can always be read from."""
        return True
795
    def writable(self):
        """Return False: the file object cannot be written to."""
        return False
798
    def seekable(self):
        """Return whatever the seekable() method of self.fileobj reports."""
        return self.fileobj.seekable()
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000801
802 def read(self, size=None):
803 """Read at most size bytes from the file. If size is not
804 present or None, read all data until EOF is reached.
805 """
806 if self.closed:
807 raise ValueError("I/O operation on closed file")
808
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000809 buf = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000810 if self.buffer:
811 if size is None:
812 buf = self.buffer
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000813 self.buffer = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000814 else:
815 buf = self.buffer[:size]
816 self.buffer = self.buffer[size:]
817
818 if size is None:
819 buf += self.fileobj.read()
820 else:
821 buf += self.fileobj.read(size - len(buf))
822
823 self.position += len(buf)
824 return buf
825
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000826 # XXX TextIOWrapper uses the read1() method.
827 read1 = read
828
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000829 def readline(self, size=-1):
830 """Read one entire line from the file. If size is present
831 and non-negative, return a string with at most that
832 size, which may be an incomplete line.
833 """
834 if self.closed:
835 raise ValueError("I/O operation on closed file")
836
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000837 pos = self.buffer.find(b"\n") + 1
838 if pos == 0:
839 # no newline found.
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000840 while True:
841 buf = self.fileobj.read(self.blocksize)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000842 self.buffer += buf
843 if not buf or b"\n" in buf:
844 pos = self.buffer.find(b"\n") + 1
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000845 if pos == 0:
846 # no newline found.
847 pos = len(self.buffer)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000848 break
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000849
850 if size != -1:
851 pos = min(size, pos)
852
853 buf = self.buffer[:pos]
854 self.buffer = self.buffer[pos:]
855 self.position += len(buf)
856 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000857
858 def readlines(self):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000859 """Return a list with all remaining lines.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000860 """
861 result = []
862 while True:
863 line = self.readline()
864 if not line: break
865 result.append(line)
866 return result
867
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000868 def tell(self):
869 """Return the current file position.
870 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000871 if self.closed:
872 raise ValueError("I/O operation on closed file")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000873
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000874 return self.position
875
876 def seek(self, pos, whence=os.SEEK_SET):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000877 """Seek to a position in the file.
878 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000879 if self.closed:
880 raise ValueError("I/O operation on closed file")
881
882 if whence == os.SEEK_SET:
883 self.position = min(max(pos, 0), self.size)
884 elif whence == os.SEEK_CUR:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000885 if pos < 0:
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000886 self.position = max(self.position + pos, 0)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000887 else:
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000888 self.position = min(self.position + pos, self.size)
889 elif whence == os.SEEK_END:
890 self.position = max(min(self.size + pos, self.size), 0)
891 else:
892 raise ValueError("Invalid argument")
893
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000894 self.buffer = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000895 self.fileobj.seek(self.position)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000896
897 def close(self):
898 """Close the file object.
899 """
900 self.closed = True
Martin v. Löwisdf241532005-03-03 08:17:42 +0000901
902 def __iter__(self):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000903 """Get an iterator over the file's lines.
Martin v. Löwisdf241532005-03-03 08:17:42 +0000904 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000905 while True:
906 line = self.readline()
907 if not line:
908 break
909 yield line
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000910#class ExFileObject
911
912#------------------
913# Exported Classes
914#------------------
915class TarInfo(object):
916 """Informational class which holds the details about an
917 archive member given by a tar header block.
918 TarInfo objects are returned by TarFile.getmember(),
919 TarFile.getmembers() and TarFile.gettarinfo() and are
920 usually created internally.
921 """
922
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000923 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
924 "chksum", "type", "linkname", "uname", "gname",
925 "devmajor", "devminor",
926 "offset", "offset_data", "pax_headers", "sparse",
927 "tarfile", "_sparse_structs", "_link_target")
928
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000929 def __init__(self, name=""):
930 """Construct a TarInfo object. name is the optional name
931 of the member.
932 """
Guido van Rossumd8faa362007-04-27 19:54:29 +0000933 self.name = name # member name
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000934 self.mode = 0o644 # file permissions
Thomas Wouters477c8d52006-05-27 19:21:47 +0000935 self.uid = 0 # user id
936 self.gid = 0 # group id
937 self.size = 0 # file size
938 self.mtime = 0 # modification time
939 self.chksum = 0 # header checksum
940 self.type = REGTYPE # member type
941 self.linkname = "" # link name
Lars Gustäbel331b8002010-10-04 15:18:47 +0000942 self.uname = "" # user name
943 self.gname = "" # group name
Thomas Wouters477c8d52006-05-27 19:21:47 +0000944 self.devmajor = 0 # device major number
945 self.devminor = 0 # device minor number
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000946
Thomas Wouters477c8d52006-05-27 19:21:47 +0000947 self.offset = 0 # the tar header starts here
948 self.offset_data = 0 # the file's data starts here
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000949
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000950 self.sparse = None # sparse member information
Guido van Rossumd8faa362007-04-27 19:54:29 +0000951 self.pax_headers = {} # pax header information
952
953 # In pax headers the "name" and "linkname" field are called
954 # "path" and "linkpath".
955 def _getpath(self):
956 return self.name
957 def _setpath(self, name):
958 self.name = name
959 path = property(_getpath, _setpath)
960
961 def _getlinkpath(self):
962 return self.linkname
963 def _setlinkpath(self, linkname):
964 self.linkname = linkname
965 linkpath = property(_getlinkpath, _setlinkpath)
966
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000967 def __repr__(self):
968 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
969
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000970 def get_info(self):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000971 """Return the TarInfo's attributes as a dictionary.
972 """
973 info = {
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +0000974 "name": self.name,
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000975 "mode": self.mode & 0o7777,
Guido van Rossumd8faa362007-04-27 19:54:29 +0000976 "uid": self.uid,
977 "gid": self.gid,
978 "size": self.size,
979 "mtime": self.mtime,
980 "chksum": self.chksum,
981 "type": self.type,
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +0000982 "linkname": self.linkname,
Guido van Rossumd8faa362007-04-27 19:54:29 +0000983 "uname": self.uname,
984 "gname": self.gname,
985 "devmajor": self.devmajor,
986 "devminor": self.devminor
987 }
988
989 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
990 info["name"] += "/"
991
992 return info
993
Victor Stinnerde629d42010-05-05 21:43:57 +0000994 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000995 """Return a tar header as a string of 512 byte blocks.
996 """
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000997 info = self.get_info()
Guido van Rossume7ba4952007-06-06 23:52:48 +0000998
Guido van Rossumd8faa362007-04-27 19:54:29 +0000999 if format == USTAR_FORMAT:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001000 return self.create_ustar_header(info, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001001 elif format == GNU_FORMAT:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001002 return self.create_gnu_header(info, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001003 elif format == PAX_FORMAT:
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001004 return self.create_pax_header(info, encoding)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001005 else:
1006 raise ValueError("invalid format")
1007
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001008 def create_ustar_header(self, info, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001009 """Return the object as a ustar header block.
1010 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001011 info["magic"] = POSIX_MAGIC
1012
1013 if len(info["linkname"]) > LENGTH_LINK:
1014 raise ValueError("linkname is too long")
1015
1016 if len(info["name"]) > LENGTH_NAME:
1017 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1018
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001019 return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001020
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001021 def create_gnu_header(self, info, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001022 """Return the object as a GNU header block sequence.
1023 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001024 info["magic"] = GNU_MAGIC
1025
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001026 buf = b""
Guido van Rossumd8faa362007-04-27 19:54:29 +00001027 if len(info["linkname"]) > LENGTH_LINK:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001028 buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001029
1030 if len(info["name"]) > LENGTH_NAME:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001031 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001032
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001033 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001034
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001035 def create_pax_header(self, info, encoding):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001036 """Return the object as a ustar header block. If it cannot be
1037 represented this way, prepend a pax extended header sequence
1038 with supplement information.
1039 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001040 info["magic"] = POSIX_MAGIC
1041 pax_headers = self.pax_headers.copy()
1042
1043 # Test string fields for values that exceed the field length or cannot
1044 # be represented in ASCII encoding.
1045 for name, hname, length in (
1046 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
1047 ("uname", "uname", 32), ("gname", "gname", 32)):
1048
Guido van Rossume7ba4952007-06-06 23:52:48 +00001049 if hname in pax_headers:
1050 # The pax header has priority.
1051 continue
1052
Guido van Rossumd8faa362007-04-27 19:54:29 +00001053 # Try to encode the string as ASCII.
1054 try:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001055 info[name].encode("ascii", "strict")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001056 except UnicodeEncodeError:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001057 pax_headers[hname] = info[name]
Guido van Rossumd8faa362007-04-27 19:54:29 +00001058 continue
1059
Guido van Rossume7ba4952007-06-06 23:52:48 +00001060 if len(info[name]) > length:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001061 pax_headers[hname] = info[name]
Guido van Rossumd8faa362007-04-27 19:54:29 +00001062
1063 # Test number fields for values that exceed the field limit or values
1064 # that like to be stored as float.
1065 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
Guido van Rossume7ba4952007-06-06 23:52:48 +00001066 if name in pax_headers:
1067 # The pax header has priority. Avoid overflow.
1068 info[name] = 0
1069 continue
1070
Guido van Rossumd8faa362007-04-27 19:54:29 +00001071 val = info[name]
1072 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001073 pax_headers[name] = str(val)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001074 info[name] = 0
1075
Guido van Rossume7ba4952007-06-06 23:52:48 +00001076 # Create a pax extended header if necessary.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001077 if pax_headers:
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001078 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001079 else:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001080 buf = b""
Guido van Rossumd8faa362007-04-27 19:54:29 +00001081
Lars Gustäbel3741eff2007-08-21 12:17:05 +00001082 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001083
1084 @classmethod
Lars Gustäbel3741eff2007-08-21 12:17:05 +00001085 def create_pax_global_header(cls, pax_headers):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001086 """Return the object as a pax global header block sequence.
1087 """
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001088 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf8")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001089
1090 def _posix_split_name(self, name):
1091 """Split a name longer than 100 chars into a prefix
1092 and a name part.
1093 """
1094 prefix = name[:LENGTH_PREFIX + 1]
1095 while prefix and prefix[-1] != "/":
1096 prefix = prefix[:-1]
1097
1098 name = name[len(prefix):]
1099 prefix = prefix[:-1]
1100
1101 if not prefix or len(name) > LENGTH_NAME:
1102 raise ValueError("name is too long")
1103 return prefix, name
1104
1105 @staticmethod
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001106 def _create_header(info, format, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001107 """Return a header block. info is a dictionary with file
1108 information, format must be one of the *_FORMAT constants.
1109 """
1110 parts = [
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001111 stn(info.get("name", ""), 100, encoding, errors),
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001112 itn(info.get("mode", 0) & 0o7777, 8, format),
Guido van Rossumd8faa362007-04-27 19:54:29 +00001113 itn(info.get("uid", 0), 8, format),
1114 itn(info.get("gid", 0), 8, format),
1115 itn(info.get("size", 0), 12, format),
1116 itn(info.get("mtime", 0), 12, format),
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001117 b" ", # checksum field
Guido van Rossumd8faa362007-04-27 19:54:29 +00001118 info.get("type", REGTYPE),
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001119 stn(info.get("linkname", ""), 100, encoding, errors),
1120 info.get("magic", POSIX_MAGIC),
Lars Gustäbel331b8002010-10-04 15:18:47 +00001121 stn(info.get("uname", ""), 32, encoding, errors),
1122 stn(info.get("gname", ""), 32, encoding, errors),
Guido van Rossumd8faa362007-04-27 19:54:29 +00001123 itn(info.get("devmajor", 0), 8, format),
1124 itn(info.get("devminor", 0), 8, format),
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001125 stn(info.get("prefix", ""), 155, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001126 ]
1127
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001128 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
Guido van Rossumd8faa362007-04-27 19:54:29 +00001129 chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
Lars Gustäbela280ca752007-08-28 07:34:33 +00001130 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
Guido van Rossumd8faa362007-04-27 19:54:29 +00001131 return buf
1132
1133 @staticmethod
1134 def _create_payload(payload):
1135 """Return the string payload filled with zero bytes
1136 up to the next 512 byte border.
1137 """
1138 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1139 if remainder > 0:
1140 payload += (BLOCKSIZE - remainder) * NUL
1141 return payload
1142
1143 @classmethod
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001144 def _create_gnu_long_header(cls, name, type, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001145 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1146 for name.
1147 """
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001148 name = name.encode(encoding, errors) + NUL
Guido van Rossumd8faa362007-04-27 19:54:29 +00001149
1150 info = {}
1151 info["name"] = "././@LongLink"
1152 info["type"] = type
1153 info["size"] = len(name)
1154 info["magic"] = GNU_MAGIC
1155
1156 # create extended header + name blocks.
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001157 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
Guido van Rossumd8faa362007-04-27 19:54:29 +00001158 cls._create_payload(name)
1159
1160 @classmethod
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001161 def _create_pax_generic_header(cls, pax_headers, type, encoding):
1162 """Return a POSIX.1-2008 extended or global header sequence
Guido van Rossumd8faa362007-04-27 19:54:29 +00001163 that contains a list of keyword, value pairs. The values
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001164 must be strings.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001165 """
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001166 # Check if one of the fields contains surrogate characters and thereby
1167 # forces hdrcharset=BINARY, see _proc_pax() for more information.
1168 binary = False
1169 for keyword, value in pax_headers.items():
1170 try:
1171 value.encode("utf8", "strict")
1172 except UnicodeEncodeError:
1173 binary = True
1174 break
1175
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001176 records = b""
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001177 if binary:
1178 # Put the hdrcharset field at the beginning of the header.
1179 records += b"21 hdrcharset=BINARY\n"
1180
Guido van Rossumd8faa362007-04-27 19:54:29 +00001181 for keyword, value in pax_headers.items():
1182 keyword = keyword.encode("utf8")
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001183 if binary:
1184 # Try to restore the original byte representation of `value'.
1185 # Needless to say, that the encoding must match the string.
1186 value = value.encode(encoding, "surrogateescape")
1187 else:
1188 value = value.encode("utf8")
1189
Guido van Rossumd8faa362007-04-27 19:54:29 +00001190 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n'
1191 n = p = 0
1192 while True:
1193 n = l + len(str(p))
1194 if n == p:
1195 break
1196 p = n
Lars Gustäbela280ca752007-08-28 07:34:33 +00001197 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"
Guido van Rossumd8faa362007-04-27 19:54:29 +00001198
1199 # We use a hardcoded "././@PaxHeader" name like star does
1200 # instead of the one that POSIX recommends.
1201 info = {}
1202 info["name"] = "././@PaxHeader"
1203 info["type"] = type
1204 info["size"] = len(records)
1205 info["magic"] = POSIX_MAGIC
1206
1207 # Create pax header + record blocks.
Lars Gustäbel3741eff2007-08-21 12:17:05 +00001208 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
Guido van Rossumd8faa362007-04-27 19:54:29 +00001209 cls._create_payload(records)
1210
Guido van Rossum75b64e62005-01-16 00:16:11 +00001211 @classmethod
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001212 def frombuf(cls, buf, encoding, errors):
1213 """Construct a TarInfo object from a 512 byte bytes object.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001214 """
Lars Gustäbel9520a432009-11-22 18:48:49 +00001215 if len(buf) == 0:
1216 raise EmptyHeaderError("empty header")
Thomas Wouters477c8d52006-05-27 19:21:47 +00001217 if len(buf) != BLOCKSIZE:
Lars Gustäbel9520a432009-11-22 18:48:49 +00001218 raise TruncatedHeaderError("truncated header")
Thomas Wouters477c8d52006-05-27 19:21:47 +00001219 if buf.count(NUL) == BLOCKSIZE:
Lars Gustäbel9520a432009-11-22 18:48:49 +00001220 raise EOFHeaderError("end of file header")
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001221
1222 chksum = nti(buf[148:156])
1223 if chksum not in calc_chksums(buf):
Lars Gustäbel9520a432009-11-22 18:48:49 +00001224 raise InvalidHeaderError("bad checksum")
Thomas Wouters477c8d52006-05-27 19:21:47 +00001225
Guido van Rossumd8faa362007-04-27 19:54:29 +00001226 obj = cls()
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001227 obj.name = nts(buf[0:100], encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001228 obj.mode = nti(buf[100:108])
1229 obj.uid = nti(buf[108:116])
1230 obj.gid = nti(buf[116:124])
1231 obj.size = nti(buf[124:136])
1232 obj.mtime = nti(buf[136:148])
1233 obj.chksum = chksum
1234 obj.type = buf[156:157]
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001235 obj.linkname = nts(buf[157:257], encoding, errors)
1236 obj.uname = nts(buf[265:297], encoding, errors)
1237 obj.gname = nts(buf[297:329], encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001238 obj.devmajor = nti(buf[329:337])
1239 obj.devminor = nti(buf[337:345])
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001240 prefix = nts(buf[345:500], encoding, errors)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001241
Guido van Rossumd8faa362007-04-27 19:54:29 +00001242 # Old V7 tar format represents a directory as a regular
1243 # file with a trailing slash.
1244 if obj.type == AREGTYPE and obj.name.endswith("/"):
1245 obj.type = DIRTYPE
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001246
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +00001247 # The old GNU sparse format occupies some of the unused
1248 # space in the buffer for up to 4 sparse structures.
1249 # Save the them for later processing in _proc_sparse().
1250 if obj.type == GNUTYPE_SPARSE:
1251 pos = 386
1252 structs = []
1253 for i in range(4):
1254 try:
1255 offset = nti(buf[pos:pos + 12])
1256 numbytes = nti(buf[pos + 12:pos + 24])
1257 except ValueError:
1258 break
1259 structs.append((offset, numbytes))
1260 pos += 24
1261 isextended = bool(buf[482])
1262 origsize = nti(buf[483:495])
1263 obj._sparse_structs = (structs, isextended, origsize)
1264
Guido van Rossumd8faa362007-04-27 19:54:29 +00001265 # Remove redundant slashes from directories.
1266 if obj.isdir():
1267 obj.name = obj.name.rstrip("/")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001268
Guido van Rossumd8faa362007-04-27 19:54:29 +00001269 # Reconstruct a ustar longname.
1270 if prefix and obj.type not in GNU_TYPES:
1271 obj.name = prefix + "/" + obj.name
1272 return obj
1273
1274 @classmethod
1275 def fromtarfile(cls, tarfile):
1276 """Return the next TarInfo object from TarFile object
1277 tarfile.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001278 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001279 buf = tarfile.fileobj.read(BLOCKSIZE)
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001280 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001281 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1282 return obj._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001283
Guido van Rossumd8faa362007-04-27 19:54:29 +00001284 #--------------------------------------------------------------------------
1285 # The following are methods that are called depending on the type of a
1286 # member. The entry point is _proc_member() which can be overridden in a
1287 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1288 # implement the following
1289 # operations:
1290 # 1. Set self.offset_data to the position where the data blocks begin,
1291 # if there is data that follows.
1292 # 2. Set tarfile.offset to the position where the next member's header will
1293 # begin.
1294 # 3. Return self or another valid TarInfo object.
1295 def _proc_member(self, tarfile):
1296 """Choose the right processing method depending on
1297 the type and call it.
Thomas Wouters89f507f2006-12-13 04:49:30 +00001298 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001299 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1300 return self._proc_gnulong(tarfile)
1301 elif self.type == GNUTYPE_SPARSE:
1302 return self._proc_sparse(tarfile)
1303 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1304 return self._proc_pax(tarfile)
1305 else:
1306 return self._proc_builtin(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001307
Guido van Rossumd8faa362007-04-27 19:54:29 +00001308 def _proc_builtin(self, tarfile):
1309 """Process a builtin type or an unknown type which
1310 will be treated as a regular file.
1311 """
1312 self.offset_data = tarfile.fileobj.tell()
1313 offset = self.offset_data
1314 if self.isreg() or self.type not in SUPPORTED_TYPES:
1315 # Skip the following data blocks.
1316 offset += self._block(self.size)
1317 tarfile.offset = offset
Thomas Wouters89f507f2006-12-13 04:49:30 +00001318
Guido van Rossume7ba4952007-06-06 23:52:48 +00001319 # Patch the TarInfo object with saved global
Guido van Rossumd8faa362007-04-27 19:54:29 +00001320 # header information.
Guido van Rossume7ba4952007-06-06 23:52:48 +00001321 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001322
1323 return self
1324
1325 def _proc_gnulong(self, tarfile):
1326 """Process the blocks that hold a GNU longname
1327 or longlink member.
1328 """
1329 buf = tarfile.fileobj.read(self._block(self.size))
1330
1331 # Fetch the next header and process it.
Lars Gustäbel9520a432009-11-22 18:48:49 +00001332 try:
1333 next = self.fromtarfile(tarfile)
1334 except HeaderError:
1335 raise SubsequentHeaderError("missing or bad subsequent header")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001336
1337 # Patch the TarInfo object from the next header with
1338 # the longname information.
1339 next.offset = self.offset
1340 if self.type == GNUTYPE_LONGNAME:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001341 next.name = nts(buf, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001342 elif self.type == GNUTYPE_LONGLINK:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001343 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001344
1345 return next
1346
1347 def _proc_sparse(self, tarfile):
1348 """Process a GNU sparse header plus extra headers.
1349 """
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +00001350 # We already collected some sparse structures in frombuf().
1351 structs, isextended, origsize = self._sparse_structs
1352 del self._sparse_structs
Guido van Rossumd8faa362007-04-27 19:54:29 +00001353
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +00001354 # Collect sparse structures from extended header blocks.
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001355 while isextended:
Guido van Rossumd8faa362007-04-27 19:54:29 +00001356 buf = tarfile.fileobj.read(BLOCKSIZE)
1357 pos = 0
Guido van Rossum805365e2007-05-07 22:24:25 +00001358 for i in range(21):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001359 try:
1360 offset = nti(buf[pos:pos + 12])
1361 numbytes = nti(buf[pos + 12:pos + 24])
1362 except ValueError:
1363 break
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001364 if offset and numbytes:
1365 structs.append((offset, numbytes))
Guido van Rossumd8faa362007-04-27 19:54:29 +00001366 pos += 24
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001367 isextended = bool(buf[504])
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001368 self.sparse = structs
Guido van Rossumd8faa362007-04-27 19:54:29 +00001369
1370 self.offset_data = tarfile.fileobj.tell()
1371 tarfile.offset = self.offset_data + self._block(self.size)
1372 self.size = origsize
Guido van Rossumd8faa362007-04-27 19:54:29 +00001373 return self
1374
1375 def _proc_pax(self, tarfile):
1376 """Process an extended or global header as described in
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001377 POSIX.1-2008.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001378 """
1379 # Read the header information.
1380 buf = tarfile.fileobj.read(self._block(self.size))
1381
1382 # A pax header stores supplemental information for either
1383 # the following file (extended) or all following files
1384 # (global).
1385 if self.type == XGLTYPE:
1386 pax_headers = tarfile.pax_headers
1387 else:
1388 pax_headers = tarfile.pax_headers.copy()
1389
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001390 # Check if the pax header contains a hdrcharset field. This tells us
1391 # the encoding of the path, linkpath, uname and gname fields. Normally,
1392 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1393 # implementations are allowed to store them as raw binary strings if
1394 # the translation to UTF-8 fails.
1395 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1396 if match is not None:
1397 pax_headers["hdrcharset"] = match.group(1).decode("utf8")
1398
1399 # For the time being, we don't care about anything other than "BINARY".
1400 # The only other value that is currently allowed by the standard is
1401 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1402 hdrcharset = pax_headers.get("hdrcharset")
1403 if hdrcharset == "BINARY":
1404 encoding = tarfile.encoding
1405 else:
1406 encoding = "utf8"
1407
Guido van Rossumd8faa362007-04-27 19:54:29 +00001408 # Parse pax header information. A record looks like that:
1409 # "%d %s=%s\n" % (length, keyword, value). length is the size
1410 # of the complete record including the length field itself and
Guido van Rossume7ba4952007-06-06 23:52:48 +00001411 # the newline. keyword and value are both UTF-8 encoded strings.
Antoine Pitroufd036452008-08-19 17:56:33 +00001412 regex = re.compile(br"(\d+) ([^=]+)=")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001413 pos = 0
1414 while True:
1415 match = regex.match(buf, pos)
1416 if not match:
1417 break
1418
1419 length, keyword = match.groups()
1420 length = int(length)
1421 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1422
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001423 # Normally, we could just use "utf8" as the encoding and "strict"
1424 # as the error handler, but we better not take the risk. For
1425 # example, GNU tar <= 1.23 is known to store filenames it cannot
1426 # translate to UTF-8 as raw strings (unfortunately without a
1427 # hdrcharset=BINARY header).
1428 # We first try the strict standard encoding, and if that fails we
1429 # fall back on the user's encoding and error handler.
1430 keyword = self._decode_pax_field(keyword, "utf8", "utf8",
1431 tarfile.errors)
1432 if keyword in PAX_NAME_FIELDS:
1433 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1434 tarfile.errors)
1435 else:
1436 value = self._decode_pax_field(value, "utf8", "utf8",
1437 tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001438
1439 pax_headers[keyword] = value
1440 pos += length
1441
Guido van Rossume7ba4952007-06-06 23:52:48 +00001442 # Fetch the next header.
Lars Gustäbel9520a432009-11-22 18:48:49 +00001443 try:
1444 next = self.fromtarfile(tarfile)
1445 except HeaderError:
1446 raise SubsequentHeaderError("missing or bad subsequent header")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001447
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001448 # Process GNU sparse information.
1449 if "GNU.sparse.map" in pax_headers:
1450 # GNU extended sparse format version 0.1.
1451 self._proc_gnusparse_01(next, pax_headers)
1452
1453 elif "GNU.sparse.size" in pax_headers:
1454 # GNU extended sparse format version 0.0.
1455 self._proc_gnusparse_00(next, pax_headers, buf)
1456
1457 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1458 # GNU extended sparse format version 1.0.
1459 self._proc_gnusparse_10(next, pax_headers, tarfile)
1460
Guido van Rossume7ba4952007-06-06 23:52:48 +00001461 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
Guido van Rossume7ba4952007-06-06 23:52:48 +00001462 # Patch the TarInfo object with the extended header info.
1463 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1464 next.offset = self.offset
1465
1466 if "size" in pax_headers:
1467 # If the extended header replaces the size field,
1468 # we need to recalculate the offset where the next
1469 # header starts.
1470 offset = next.offset_data
1471 if next.isreg() or next.type not in SUPPORTED_TYPES:
1472 offset += next._block(next.size)
1473 tarfile.offset = offset
1474
1475 return next
1476
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001477 def _proc_gnusparse_00(self, next, pax_headers, buf):
1478 """Process a GNU tar extended sparse header, version 0.0.
1479 """
1480 offsets = []
1481 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1482 offsets.append(int(match.group(1)))
1483 numbytes = []
1484 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1485 numbytes.append(int(match.group(1)))
1486 next.sparse = list(zip(offsets, numbytes))
1487
1488 def _proc_gnusparse_01(self, next, pax_headers):
1489 """Process a GNU tar extended sparse header, version 0.1.
1490 """
1491 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1492 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1493
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

    The sparse map is read from `tarfile.fileobj' in BLOCKSIZE chunks.
    It consists of newline-terminated decimal numbers: first the number
    of map entries, then an offset and a size for each entry. After
    parsing, `next.offset_data' is set to the current file position and
    `next.sparse' to the list of (offset, size) tuples.
    """
    stream = tarfile.fileobj
    pending = stream.read(BLOCKSIZE)

    # The first line holds the number of (offset, size) pairs.
    head, pending = pending.split(b"\n", 1)
    wanted = int(head) * 2

    numbers = []
    while len(numbers) < wanted:
        # Pull in another block whenever no complete line is buffered.
        if b"\n" not in pending:
            pending += stream.read(BLOCKSIZE)
        token, pending = pending.split(b"\n", 1)
        numbers.append(int(token))

    next.offset_data = stream.tell()
    next.sparse = list(zip(numbers[::2], numbers[1::2]))
1509
Guido van Rossume7ba4952007-06-06 23:52:48 +00001510 def _apply_pax_info(self, pax_headers, encoding, errors):
1511 """Replace fields with supplemental information from a previous
1512 pax extended or global header.
1513 """
1514 for keyword, value in pax_headers.items():
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001515 if keyword == "GNU.sparse.name":
1516 setattr(self, "path", value)
1517 elif keyword == "GNU.sparse.size":
1518 setattr(self, "size", int(value))
1519 elif keyword == "GNU.sparse.realsize":
1520 setattr(self, "size", int(value))
1521 elif keyword in PAX_FIELDS:
1522 if keyword in PAX_NUMBER_FIELDS:
1523 try:
1524 value = PAX_NUMBER_FIELDS[keyword](value)
1525 except ValueError:
1526 value = 0
1527 if keyword == "path":
1528 value = value.rstrip("/")
1529 setattr(self, keyword, value)
Guido van Rossume7ba4952007-06-06 23:52:48 +00001530
1531 self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001532
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001533 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1534 """Decode a single field from a pax record.
1535 """
1536 try:
1537 return value.decode(encoding, "strict")
1538 except UnicodeDecodeError:
1539 return value.decode(fallback_encoding, fallback_errors)
1540
def _block(self, count):
    """Round up a byte count by BLOCKSIZE and return it,
       e.g. _block(834) => 1024.
    """
    full, rest = divmod(count, BLOCKSIZE)
    # A partially filled block still occupies a whole block.
    needed = full + 1 if rest else full
    return needed * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001549
def isreg(self):
    """Return True if self.type is one of REGULAR_TYPES."""
    return self.type in REGULAR_TYPES
def isfile(self):
    """Return the result of isreg()."""
    return self.isreg()
def isdir(self):
    """Return True if self.type is DIRTYPE."""
    return self.type == DIRTYPE
def issym(self):
    """Return True if self.type is SYMTYPE."""
    return self.type == SYMTYPE
def islnk(self):
    """Return True if self.type is LNKTYPE."""
    return self.type == LNKTYPE
def ischr(self):
    """Return True if self.type is CHRTYPE."""
    return self.type == CHRTYPE
def isblk(self):
    """Return True if self.type is BLKTYPE."""
    return self.type == BLKTYPE
def isfifo(self):
    """Return True if self.type is FIFOTYPE."""
    return self.type == FIFOTYPE
def issparse(self):
    """Return True if a sparse map has been stored in self.sparse."""
    return self.sparse is not None
def isdev(self):
    """Return True if self.type is CHRTYPE, BLKTYPE or FIFOTYPE."""
    return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1570# class TarInfo
1571
1572class TarFile(object):
1573 """The TarFile Class provides an interface to tar archives.
1574 """
1575
# Class-level defaults. __init__ overrides each of them on the instance
# when the corresponding argument is not None (errors is always set).

debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

dereference = False         # If true, add content of linked file to the
                            # tar file, else the link.

ignore_zeros = False        # If true, skips empty or invalid blocks and
                            # continues processing.

errorlevel = 1              # If 0, fatal errors only appear in debug
                            # messages (if debug >= 0). If > 0, errors
                            # are passed to the caller as exceptions.

format = DEFAULT_FORMAT     # The format to use when creating an archive.

encoding = ENCODING         # Encoding for 8-bit character strings.

errors = None               # Error handler for unicode conversion;
                            # __init__ replaces it with its `errors'
                            # argument (default "surrogateescape").

tarinfo = TarInfo           # The default TarInfo class to use.

fileobject = ExFileObject   # The default ExFileObject class to use.
1597
1598 def __init__(self, name=None, mode="r", fileobj=None, format=None,
1599 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
Victor Stinnerde629d42010-05-05 21:43:57 +00001600 errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001601 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
1602 read from an existing archive, 'a' to append data to an existing
1603 file or 'w' to create a new file overwriting an existing one. `mode'
1604 defaults to 'r'.
1605 If `fileobj' is given, it is used for reading or writing data. If it
1606 can be determined, `mode' is overridden by `fileobj's mode.
1607 `fileobj' is not closed, when TarFile is closed.
1608 """
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001609 if len(mode) > 1 or mode not in "raw":
Thomas Wouters477c8d52006-05-27 19:21:47 +00001610 raise ValueError("mode must be 'r', 'a' or 'w'")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001611 self.mode = mode
1612 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001613
1614 if not fileobj:
Guido van Rossumd8faa362007-04-27 19:54:29 +00001615 if self.mode == "a" and not os.path.exists(name):
Thomas Wouterscf297e42007-02-23 15:07:44 +00001616 # Create nonexistent files in append mode.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001617 self.mode = "w"
1618 self._mode = "wb"
Guido van Rossume7ba4952007-06-06 23:52:48 +00001619 fileobj = bltn_open(name, self._mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001620 self._extfileobj = False
1621 else:
Guido van Rossumd8faa362007-04-27 19:54:29 +00001622 if name is None and hasattr(fileobj, "name"):
1623 name = fileobj.name
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001624 if hasattr(fileobj, "mode"):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001625 self._mode = fileobj.mode
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001626 self._extfileobj = True
Thomas Woutersed03b412007-08-28 21:37:11 +00001627 self.name = os.path.abspath(name) if name else None
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001628 self.fileobj = fileobj
1629
Guido van Rossumd8faa362007-04-27 19:54:29 +00001630 # Init attributes.
1631 if format is not None:
1632 self.format = format
1633 if tarinfo is not None:
1634 self.tarinfo = tarinfo
1635 if dereference is not None:
1636 self.dereference = dereference
1637 if ignore_zeros is not None:
1638 self.ignore_zeros = ignore_zeros
1639 if encoding is not None:
1640 self.encoding = encoding
Victor Stinnerde629d42010-05-05 21:43:57 +00001641 self.errors = errors
Guido van Rossume7ba4952007-06-06 23:52:48 +00001642
1643 if pax_headers is not None and self.format == PAX_FORMAT:
1644 self.pax_headers = pax_headers
1645 else:
1646 self.pax_headers = {}
1647
Guido van Rossumd8faa362007-04-27 19:54:29 +00001648 if debug is not None:
1649 self.debug = debug
1650 if errorlevel is not None:
1651 self.errorlevel = errorlevel
1652
1653 # Init datastructures.
Thomas Wouters477c8d52006-05-27 19:21:47 +00001654 self.closed = False
1655 self.members = [] # list of members as TarInfo objects
1656 self._loaded = False # flag if all members have been read
Christian Heimesd8654cf2007-12-02 15:22:16 +00001657 self.offset = self.fileobj.tell()
1658 # current position in the archive file
Thomas Wouters477c8d52006-05-27 19:21:47 +00001659 self.inodes = {} # dictionary caching the inodes of
1660 # archive members already added
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001661
Lars Gustäbel7b465392009-11-18 20:29:25 +00001662 try:
1663 if self.mode == "r":
1664 self.firstmember = None
1665 self.firstmember = self.next()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001666
Lars Gustäbel7b465392009-11-18 20:29:25 +00001667 if self.mode == "a":
1668 # Move to the end of the archive,
1669 # before the first empty block.
Lars Gustäbel7b465392009-11-18 20:29:25 +00001670 while True:
Lars Gustäbel9520a432009-11-22 18:48:49 +00001671 self.fileobj.seek(self.offset)
1672 try:
1673 tarinfo = self.tarinfo.fromtarfile(self)
1674 self.members.append(tarinfo)
1675 except EOFHeaderError:
1676 self.fileobj.seek(self.offset)
Lars Gustäbel7b465392009-11-18 20:29:25 +00001677 break
Lars Gustäbel9520a432009-11-22 18:48:49 +00001678 except HeaderError as e:
1679 raise ReadError(str(e))
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001680
Lars Gustäbel7b465392009-11-18 20:29:25 +00001681 if self.mode in "aw":
1682 self._loaded = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001683
Lars Gustäbel7b465392009-11-18 20:29:25 +00001684 if self.pax_headers:
1685 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
1686 self.fileobj.write(buf)
1687 self.offset += len(buf)
1688 except:
1689 if not self._extfileobj:
1690 self.fileobj.close()
1691 self.closed = True
1692 raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001693
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001694 #--------------------------------------------------------------------------
1695 # Below are the classmethods which act as alternate constructors to the
1696 # TarFile class. The open() method is the only one that is needed for
1697 # public use; it is the "super"-constructor and is able to select an
1698 # adequate "sub"-constructor for a particular compression using the mapping
1699 # from OPEN_METH.
1700 #
1701 # This concept allows one to subclass TarFile without losing the comfort of
1702 # the super-constructor. A sub-constructor is registered and made available
1703 # by adding it to the mapping in OPEN_METH.
1704
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
       an appropriate TarFile class.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing

       Raises ValueError if neither `name' nor `fileobj' is given or if
       `mode' cannot be interpreted, CompressionError for an unknown
       compression type and ReadError if no registered opener accepts
       the file in transparent read mode.
    """

    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Find out which *open() is appropriate for opening the file.
        for comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
            if fileobj is not None:
                saved_pos = fileobj.tell()
            try:
                return func(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Rewind so the next opener sees the same data.
                if fileobj is not None:
                    fileobj.seek(saved_pos)
                continue
        raise ReadError("file could not be opened successfully")

    elif ":" in mode:
        filemode, comptype = mode.split(":", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
        else:
            raise CompressionError("unknown compression type %r" % comptype)
        return func(name, filemode, fileobj, **kwargs)

    elif "|" in mode:
        filemode, comptype = mode.split("|", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Tuple membership, not a substring test: `"rw" in "rw"' is true
        # and would let an invalid file mode through.
        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            t = cls(name, filemode, stream, **kwargs)
        except:
            stream.close()
            raise
        t._extfileobj = False
        return t

    # Tuple membership for the same reason: `"" in "aw"' and
    # `"aw" in "aw"' are both true.
    elif mode in ("a", "w"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001780
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.

       Returns cls(name, mode, fileobj, **kwargs). Raises ValueError
       unless `mode' is exactly 'r', 'a' or 'w'.
    """
    # Tuple membership rejects the empty string, which a substring test
    # against "raw" would accept.
    if mode not in ("r", "a", "w"):
        raise ValueError("mode must be 'r', 'a' or 'w'")
    return cls(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001788
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError unless `mode' is exactly 'r' or 'w',
       CompressionError if the gzip module is unavailable and ReadError
       if opening or reading the archive fails with an IOError.
    """
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    extfileobj = fileobj is not None
    # Track the wrapper separately: if GzipFile() itself fails there is
    # nothing of ours to close, and calling close() on a None fileobj
    # would replace the real error with an AttributeError.
    gzfile = None
    try:
        gzfile = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
        t = cls.taropen(name, mode, gzfile, **kwargs)
    except IOError:
        if gzfile is not None and not extfileobj:
            gzfile.close()
        raise ReadError("not a gzip file")
    except:
        if gzfile is not None and not extfileobj:
            gzfile.close()
        raise
    t._extfileobj = extfileobj
    return t
1817
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError unless `mode' is exactly 'r' or 'w',
       CompressionError if the bz2 module is unavailable and ReadError
       if reading the archive fails with an IOError or EOFError.
    """
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    if fileobj is not None:
        fileobj = _BZ2Proxy(fileobj, mode)
    else:
        fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (IOError, EOFError):
        fileobj.close()
        raise ReadError("not a bzip2 file")
    except:
        # Same cleanup as gzopen(): do not leak the bz2 object when
        # taropen() fails for any other reason.
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1843
# All *open() methods are registered here. Keys are the compression
# type names used in mode strings; values are the names of the
# classmethods that open() looks up with getattr().
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz":  "gzopen",    # gzip compressed tar
    "bz2": "bz2open"    # bzip2 compressed tar
}
1850
1851 #--------------------------------------------------------------------------
1852 # The public methods which TarFile provides:
1853
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive.

       Calling close() on an already closed TarFile does nothing. The
       underlying file object is closed only if TarFile opened it
       itself, and that happens even when writing the trailer fails.
    """
    if self.closed:
        return

    try:
        if self.mode in ("a", "w"):
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
    finally:
        # Release the file even if a write above raised, so a failed
        # close() does not leak the handle.
        self.closed = True
        if not self._extfileobj:
            self.fileobj.close()
1873
def getmember(self, name):
    """Return a TarInfo object for member `name'. If `name' can not be
       found in the archive, KeyError is raised. If a member occurs more
       than once in the archive, its last occurrence is assumed to be the
       most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001884
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects. The
       list has the same order as the members in the archive.
    """
    self._check()
    # A complete member list requires that the whole archive has been
    # scanned once.
    fully_scanned = self._loaded
    if not fully_scanned:
        self._load()
    return self.members
1894
def getnames(self):
    """Return the members of the archive as a list of their names. It has
       the same order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001900
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object for either the file `name' or the file
       object `fileobj' (using os.fstat on its file descriptor). You can
       modify some of the TarInfo's attributes before you add it using
       addfile(). If given, `arcname' specifies an alternative name for the
       file in the archive.

       Returns None when the file is of a type that matches none of the
       stat.S_IS* checks below.
    """
    self._check("aw")

    # When fileobj is given, replace name by
    # fileobj's real name.
    if fileobj is not None:
        name = fileobj.name

    # Building the name of the member in the archive.
    # Backward slashes are converted to forward slashes,
    # Absolute paths are turned to relative paths.
    if arcname is None:
        arcname = name
    _drive, arcname = os.path.splitdrive(arcname)
    arcname = arcname.replace(os.sep, "/").lstrip("/")

    # Now, fill the TarInfo object with
    # information specific for the file.
    tarinfo = self.tarinfo()
    tarinfo.tarfile = self

    # Use os.stat or os.lstat, depending on platform
    # and if symlinks shall be resolved.
    if fileobj is not None:
        statres = os.fstat(fileobj.fileno())
    elif hasattr(os, "lstat") and not self.dereference:
        statres = os.lstat(name)
    else:
        statres = os.stat(name)
    linkname = ""

    stmd = statres.st_mode
    if stat.S_ISREG(stmd):
        inode = (statres.st_ino, statres.st_dev)
        is_known_hardlink = (not self.dereference
                             and statres.st_nlink > 1
                             and inode in self.inodes
                             and arcname != self.inodes[inode])
        if is_known_hardlink:
            # Is it a hardlink to an already
            # archived file?
            kind = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # The inode is added only if its valid.
            # For win32 it is always 0.
            kind = REGTYPE
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(stmd):
        kind = DIRTYPE
    elif stat.S_ISFIFO(stmd):
        kind = FIFOTYPE
    elif stat.S_ISLNK(stmd):
        kind = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(stmd):
        kind = CHRTYPE
    elif stat.S_ISBLK(stmd):
        kind = BLKTYPE
    else:
        return None

    # Fill the TarInfo object with all
    # information we can get.
    tarinfo.name = arcname
    tarinfo.mode = stmd
    tarinfo.uid = statres.st_uid
    tarinfo.gid = statres.st_gid
    # Only regular files carry data in the archive.
    tarinfo.size = statres.st_size if kind == REGTYPE else 0
    tarinfo.mtime = statres.st_mtime
    tarinfo.type = kind
    tarinfo.linkname = linkname

    # User and group names are optional; unknown ids are left alone.
    if pwd:
        try:
            tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
        except KeyError:
            pass

    if (kind in (CHRTYPE, BLKTYPE)
            and hasattr(os, "major") and hasattr(os, "minor")):
        tarinfo.devmajor = os.major(statres.st_rdev)
        tarinfo.devminor = os.minor(statres.st_rdev)
    return tarinfo
1998
def list(self, verbose=True):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced.
    """
    self._check()

    for tarinfo in self:
        if verbose:
            print(filemode(tarinfo.mode), end=' ')

            owner = tarinfo.uname or tarinfo.uid
            group = tarinfo.gname or tarinfo.gid
            print("%s/%s" % (owner, group), end=' ')

            # Device nodes show "major,minor" where other members
            # show their size.
            if tarinfo.ischr() or tarinfo.isblk():
                device = "%d,%d" % (tarinfo.devmajor, tarinfo.devminor)
                print("%10s" % device, end=' ')
            else:
                print("%10d" % tarinfo.size, end=' ')

            stamp = time.localtime(tarinfo.mtime)[:6]
            print("%d-%02d-%02d %02d:%02d:%02d" % stamp, end=' ')

        suffix = "/" if tarinfo.isdir() else ""
        print(tarinfo.name + suffix, end=' ')

        if verbose:
            if tarinfo.issym():
                print("->", tarinfo.linkname, end=' ')
            if tarinfo.islnk():
                print("link to", tarinfo.linkname, end=' ')
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002027
def add(self, name, arcname=None, recursive=True, exclude=None, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. `exclude' is a function that should
       return True for each filename to be excluded. `filter' is a function
       that expects a TarInfo object argument and returns the changed
       TarInfo object, if it returns None the TarInfo object will be
       excluded from the archive.

       Passing `exclude' emits a DeprecationWarning. The archive file
       itself and files of unsupported type are skipped with a debug
       message.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)

    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        # The with block closes the source file even when addfile()
        # raises, so a failed add does not leak the handle.
        with bltn_open(name, "rb") as f:
            self.addfile(tarinfo, f)

    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            for f in os.listdir(name):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                        recursive, exclude, filter)

    else:
        self.addfile(tarinfo)
2089
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, tarinfo.size bytes are read from it and added to the archive.
       You can create TarInfo objects using gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.
    """
    self._check("aw")

    # Work on a shallow copy so the caller's object is not the one
    # stored in the member list.
    tarinfo = copy.copy(tarinfo)

    header = tarinfo.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset += len(header)

    # If there's data to follow, append it.
    if fileobj is not None:
        copyfileobj(fileobj, self.fileobj, tarinfo.size)
        full, tail = divmod(tarinfo.size, BLOCKSIZE)
        if tail > 0:
            # Pad the last block with NULs up to BLOCKSIZE.
            self.fileobj.write(NUL * (BLOCKSIZE - tail))
            full += 1
        self.offset += full * BLOCKSIZE

    self.members.append(tarinfo)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002115
def extractall(self, path=".", members=None):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers().
    """
    if members is None:
        members = self

    deferred = []
    for tarinfo in members:
        is_directory = tarinfo.isdir()
        if is_directory:
            # Extract directories with a safe mode. The original
            # TarInfo is remembered for the second pass below.
            deferred.append(tarinfo)
            tarinfo = copy.copy(tarinfo)
            tarinfo.mode = 0o700
        # Do not set_attrs directories, as we will do that further down
        self.extract(tarinfo, path, set_attrs=not is_directory)

    # Set correct owner, mtime and filemode on directories,
    # visiting them in reverse name order.
    for tarinfo in reversed(sorted(deferred, key=lambda entry: entry.name)):
        dirpath = os.path.join(path, tarinfo.name)
        try:
            self.chown(tarinfo, dirpath)
            self.utime(tarinfo, dirpath)
            self.chmod(tarinfo, dirpath)
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % e)
2153
def extract(self, member, path="", set_attrs=True):
    """Extract a single member from the archive, using its full name,
       below the directory `path' (the current working directory by
       default). `member' may be a filename or a TarInfo object. File
       attributes (owner, mtime, mode) are set unless `set_attrs' is
       False.

       An EnvironmentError is re-raised if errorlevel > 0, an
       ExtractError if errorlevel > 1; otherwise the error is only
       reported through _dbg().
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # makelink() needs to know where a hard link's target ends up.
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    try:
        self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                             set_attrs=set_attrs)
    except EnvironmentError as e:
        if self.errorlevel > 0:
            raise
        if e.filename is None:
            report = "tarfile: %s" % e.strerror
        else:
            report = "tarfile: %s %r" % (e.strerror, e.filename)
        self._dbg(1, report)
    except ExtractError as e:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % e)
2188
def extractfile(self, member):
    """Extract a member from the archive as a file object. `member' may
       be a filename or a TarInfo object. For a regular file, or a
       member whose type is not in SUPPORTED_TYPES, a file-like object
       is returned. For a link, the file-like object of the link's
       target is returned. For anything else, None is returned.
       The file-like object is read-only and provides the following
       methods: read(), readline(), readlines(), seek() and tell()
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Members of unknown type are treated like regular files.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    if not (tarinfo.islnk() or tarinfo.issym()):
        # There is no data associated with the member (directory,
        # chrdev, blkdev, etc.), so there is no file object either.
        return None

    if isinstance(self.fileobj, _Stream):
        # A small but ugly workaround for the case that someone tries
        # to extract a (sym)link as a file-object from a non-seekable
        # stream of tar blocks.
        raise StreamError("cannot extract (sym)link as file object")

    # A (sym)link's file object is its target's file object.
    return self.extractfile(self._find_link_target(tarinfo))
2226
def _extract_member(self, tarinfo, targetpath, set_attrs=True):
    """Extract the TarInfo object tarinfo to a physical
       file called targetpath. Owner, mode and modification time are
       applied afterwards unless `set_attrs' is False.
    """
    # Build the destination pathname: drop trailing slashes and replace
    # forward slashes with the platform specific separator.
    targetpath = targetpath.rstrip("/").replace("/", os.sep)

    # Create all upper directories. Directories that are not part of
    # the archive get default permissions.
    parent = os.path.dirname(targetpath)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    if tarinfo.islnk() or tarinfo.issym():
        label = "%s -> %s" % (tarinfo.name, tarinfo.linkname)
    else:
        label = tarinfo.name
    self._dbg(1, label)

    # Hand over to the make*() method that matches the member's type.
    if tarinfo.isreg():
        self.makefile(tarinfo, targetpath)
    elif tarinfo.isdir():
        self.makedir(tarinfo, targetpath)
    elif tarinfo.isfifo():
        self.makefifo(tarinfo, targetpath)
    elif tarinfo.ischr() or tarinfo.isblk():
        self.makedev(tarinfo, targetpath)
    elif tarinfo.islnk() or tarinfo.issym():
        self.makelink(tarinfo, targetpath)
    elif tarinfo.type not in SUPPORTED_TYPES:
        self.makeunknown(tarinfo, targetpath)
    else:
        self.makefile(tarinfo, targetpath)

    if not set_attrs:
        return

    self.chown(tarinfo, targetpath)
    if not tarinfo.issym():
        # Mode and mtime are not applied to symbolic links.
        self.chmod(tarinfo, targetpath)
        self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002269
2270 #--------------------------------------------------------------------------
2271 # Below are the different file methods. They are called via
2272 # _extract_member() when extract() is called. They can be replaced in a
2273 # subclass to implement other functionality.
2274
def makedir(self, tarinfo, targetpath):
    """Make a directory called targetpath. A directory that already
       exists is left alone.
    """
    try:
        # The directory gets a safe mode here, the real mode is set
        # later in _extract_member().
        os.mkdir(targetpath, 0o700)
    except EnvironmentError as exc:
        already_there = exc.errno == errno.EEXIST
        if not already_there:
            raise
2285
def makefile(self, tarinfo, targetpath):
    """Make a file called targetpath and fill it with the member's data.

       The data is read from self.fileobj starting at
       tarinfo.offset_data. If tarinfo.sparse is not None, it is
       iterated as (offset, size) pairs and each chunk is written at
       its offset in the target. Otherwise tarinfo.size bytes are
       copied in one go. The target is finally cut or extended to
       exactly tarinfo.size bytes.
    """
    source = self.fileobj
    source.seek(tarinfo.offset_data)
    # The with statement makes sure the target file is closed even if
    # seeking or copying fails, so no file handle is leaked.
    with bltn_open(targetpath, "wb") as target:
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
        else:
            copyfileobj(source, target, tarinfo.size)
        # Covers a sparse member that ends in a hole: position at the
        # logical end of the file and truncate there.
        target.seek(tarinfo.size)
        target.truncate()
2301
def makeunknown(self, tarinfo, targetpath):
    """Make a file from a TarInfo object with an unknown type
       at targetpath. The member is extracted like a regular file and
       a notice is written through _dbg().
    """
    self.makefile(tarinfo, targetpath)
    notice = ("tarfile: Unknown file type %r, "
              "extracted as regular file.") % tarinfo.type
    self._dbg(1, notice)
2309
def makefifo(self, tarinfo, targetpath):
    """Make a fifo called targetpath. Raise ExtractError if the os
       module provides no mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002317
def makedev(self, tarinfo, targetpath):
    """Make a character or block device called targetpath. Raise
       ExtractError if the os module lacks mknod() or makedev().
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the member's mode with the bit for the kind of device.
    perms = tarinfo.mode
    type_bit = stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR

    os.mknod(targetpath, perms | type_bit,
             os.makedev(tarinfo.devmajor, tarinfo.devminor))
2332
def makelink(self, tarinfo, targetpath):
    """Make a (symbolic) link called targetpath. If it cannot be created
       (platform limitation), we try to make a copy of the referenced file
       instead of a link.

       Raise ExtractError if the link cannot be created and the
       referenced member cannot be found inside the archive.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        else:
            # tarinfo._link_target is prepared by extract().
            if os.path.exists(tarinfo._link_target):
                os.link(tarinfo._link_target, targetpath)
            else:
                # The file the hard link refers to has not been
                # extracted, so extract the referenced member under
                # the link's name instead.
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
    except symlink_exception:
        # NOTE(review): symlink_exception is defined elsewhere in this
        # module; presumably it names the error(s) raised on platforms
        # that cannot create links -- confirm.
        # The copy fallback belongs here: it must only run when the
        # link could not be created, never after a successful link.
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002361
def chown(self, tarinfo, targetpath):
    """Set owner of targetpath according to tarinfo. Nothing is done
       unless the pwd module is available and the effective user id
       is 0. Raise ExtractError if the owner cannot be changed.
    """
    # We have to be root to do so.
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        return

    # Resolve the group: by name first, then by numeric id, and fall
    # back to the group of the current process.
    try:
        gid = grp.getgrnam(tarinfo.gname)[2]
    except KeyError:
        try:
            gid = grp.getgrgid(tarinfo.gid)[2]
        except KeyError:
            gid = os.getgid()

    # Resolve the user the same way.
    try:
        uid = pwd.getpwnam(tarinfo.uname)[2]
    except KeyError:
        try:
            uid = pwd.getpwuid(tarinfo.uid)[2]
        except KeyError:
            uid = os.getuid()

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, uid, gid)
        elif sys.platform != "os2emx":
            os.chown(targetpath, uid, gid)
    except EnvironmentError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002389
def chmod(self, tarinfo, targetpath):
    """Set file permissions of targetpath according to tarinfo. Raise
       ExtractError if the mode cannot be changed.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except EnvironmentError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002398
def utime(self, tarinfo, targetpath):
    """Set modification time of targetpath according to tarinfo. The
       access time is set to the same value. Raise ExtractError if
       the times cannot be changed.
    """
    if hasattr(os, 'utime'):
        stamp = (tarinfo.mtime, tarinfo.mtime)
        try:
            os.utime(targetpath, stamp)
        except EnvironmentError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002408
2409 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, when
       TarFile is opened for reading. Return None if there is no more
       available.
    """
    self._check("ra")

    # A member that is already stored in firstmember is handed out
    # first, and only once.
    if self.firstmember is not None:
        first, self.firstmember = self.firstmember, None
        return first

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as e:
            # With ignore_zeros set, skip the block and try the next
            # one; otherwise leave the loop without a member.
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as e:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
            elif self.offset == 0:
                # A bad header at the very start of the file is
                # reported as a ReadError.
                raise ReadError(str(e))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as e:
            if self.offset == 0:
                raise ReadError(str(e))
        except SubsequentHeaderError as e:
            raise ReadError(str(e))
        break

    if tarinfo is None:
        # Nothing more could be read, the member list is complete.
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002455
2456 #--------------------------------------------------------------------------
2457 # Little helper methods:
2458
def _getmember(self, name, tarinfo=None, normalize=False):
    """Find an archive member by name from bottom to top.
       If tarinfo is given, only the members in front of it are
       searched. If normalize is true, both `name' and the member
       names are passed through os.path.normpath() before they are
       compared. Return None if no member matches.
    """
    # getmembers() ensures that all members have been loaded.
    candidates = self.getmembers()

    # Limit the member search list up to tarinfo.
    if tarinfo is not None:
        candidates = candidates[:candidates.index(tarinfo)]

    if normalize:
        name = os.path.normpath(name)

    for candidate in reversed(candidates):
        if normalize:
            candidate_name = os.path.normpath(candidate.name)
        else:
            candidate_name = candidate.name
        if name == candidate_name:
            return candidate

    return None
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002481
def _load(self):
    """Read through the entire archive file and look for readable
       members. Afterwards the archive is marked as loaded.
    """
    # next() returns None once there is no member left.
    while self.next() is not None:
        pass
    self._loaded = True
2491
def _check(self, mode=None):
    """Check if TarFile is still open, and if the operation's mode
       corresponds to TarFile's mode. `mode' is a string of allowed
       modes, None skips the mode check. Raise IOError otherwise.
    """
    if self.closed:
        raise IOError("%s is closed" % self.__class__.__name__)
    if mode is None:
        return
    if self.mode not in mode:
        raise IOError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002500
def _find_link_target(self, tarinfo):
    """Find the target member of a symlink or hardlink member in the
       archive. Raise KeyError if there is no such member.
    """
    if tarinfo.issym():
        # The linkname is taken relative to the directory of the link
        # itself. Always search the entire archive.
        linkname = "/".join((os.path.dirname(tarinfo.name),
                             tarinfo.linkname))
        limit = None
    else:
        # Search the archive before the link, because a hard link is
        # just a reference to an already archived file.
        linkname = tarinfo.linkname
        limit = tarinfo

    target = self._getmember(linkname, tarinfo=limit, normalize=True)
    if target is None:
        raise KeyError("linkname %r not found" % linkname)
    return target
2519
def __iter__(self):
    """Provide an iterator object: a plain iterator over the list of
       members once the archive is loaded, a TarIter otherwise.
    """
    return iter(self.members) if self._loaded else TarIter(self)
2527
def _dbg(self, level, msg):
    """Write debugging output to sys.stderr. Messages with a level
       above self.debug are dropped.
    """
    if level > self.debug:
        return
    print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002533
def __enter__(self):
    """Enter the runtime context: check that the TarFile is still
       open and return the TarFile itself.
    """
    self._check()
    return self
2537
def __exit__(self, type, value, traceback):
    """Leave the runtime context. Without an exception the archive is
       closed the regular way. With one, only the file object is
       closed (unless _extfileobj is set) and the TarFile is marked
       as closed.
    """
    if type is not None:
        # An exception occurred. We must not call close() because
        # it would try to write end-of-archive blocks and padding.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        return
    self.close()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002547# class TarFile
2548
class TarIter:
    """Iterator Class.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object that iterates over `tarfile',
           starting at its first member.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return iterator object.
        """
        return self

    def __next__(self):
        """Return the next item using TarFile's next() method.
           When all members have been read, set TarFile as _loaded.
        """
        archive = self.tarfile
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely. Once the
        # archive is loaded, the members are therefore taken from the
        # member list by index.
        if archive._loaded:
            try:
                tarinfo = archive.members[self.index]
            except IndexError:
                raise StopIteration
        else:
            tarinfo = archive.next()
            if not tarinfo:
                archive._loaded = True
                raise StopIteration
        self.index += 1
        return tarinfo
2584
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002585#--------------------
2586# exported functions
2587#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.
    """
    # `open' is looked up when the function is called. By then the
    # module-level name has been rebound to TarFile.open.
    try:
        archive = open(name)
        archive.close()
    except TarError:
        return False
    else:
        return True
2598
# Keep a reference to the builtin open() under another name before the
# module-level name is rebound; makefile() uses it to create the files
# it extracts.
bltn_open = open
# From here on the module-level open() is TarFile.open. is_tarfile()
# resolves `open' to this binding when it is called.
open = TarFile.open