blob: 6bdbf36e83013254a0ab7519a06f348f698428dd [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
Guido van Rossumd8faa362007-04-27 19:54:29 +000032version = "0.9.0"
Guido van Rossum98297ee2007-11-06 21:34:58 +000033__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
Senthil Kumaran7c9719c2011-07-28 22:32:49 +080034__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
35__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
Guido van Rossum98297ee2007-11-06 21:34:58 +000036__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000037
38#---------
39# Imports
40#---------
41import sys
42import os
43import shutil
44import stat
45import errno
46import time
47import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000048import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000049import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000050
51try:
52 import grp, pwd
53except ImportError:
54 grp = pwd = None
55
# os.symlink on Windows prior to 6.0 raises NotImplementedError
try:
    # WindowsError (1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege.  Referencing the name fails
    # with NameError where it is not defined, in which case it is left out.
    symlink_exception = (AttributeError, NotImplementedError, WindowsError)
except NameError:
    symlink_exception = (AttributeError, NotImplementedError)
64
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000065# from tarfile import *
66__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
67
Georg Brandl1a3284e2007-12-02 09:40:06 +000068from builtins import open as _open # Since 'open' is TarFile.open
Guido van Rossum8f78fe92006-08-24 04:03:53 +000069
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000070#---------------------------------------------------------
71# tar constants
72#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
# Both magic strings are exactly 8 bytes long.  The GNU one is "ustar"
# followed by two spaces and a NUL.
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000105
106#---------------------------------------------------------
107# tarfile constants
108#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (
    REGTYPE, AREGTYPE, LNKTYPE,
    SYMTYPE, DIRTYPE, FIFOTYPE,
    CONTTYPE, CHRTYPE, BLKTYPE,
    GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
    GNUTYPE_SPARSE,
)

# File types that will be treated as a regular file.
REGULAR_TYPES = (
    REGTYPE, AREGTYPE,
    CONTTYPE, GNUTYPE_SPARSE,
)

# File types that are part of the GNU tar format.
GNU_TYPES = (
    GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
    GNUTYPE_SPARSE,
)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = (
    "path", "linkpath", "size", "mtime",
    "uid", "gid", "uname", "gname",
)

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Each value is the converter for that field.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int,
}
141
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000142#---------------------------------------------------------
143# Bits used in the mode field, values in octal.
144#---------------------------------------------------------
# File type bits.
S_IFLNK = 0o120000        # symbolic link
S_IFREG = 0o100000        # regular file
S_IFBLK = 0o060000        # block device
S_IFDIR = 0o040000        # directory
S_IFCHR = 0o020000        # character device
S_IFIFO = 0o010000        # fifo

# Special execution bits.
TSUID = 0o4000            # set UID on execution
TSGID = 0o2000            # set GID on execution
TSVTX = 0o1000            # reserved

# Permission bits: owner, then group, then other.
TUREAD = 0o400            # read by owner
TUWRITE = 0o200           # write by owner
TUEXEC = 0o100            # execute/search by owner
TGREAD = 0o040            # read by group
TGWRITE = 0o020           # write by group
TGEXEC = 0o010            # execute/search by group
TOREAD = 0o004            # read by other
TOWRITE = 0o002           # write by other
TOEXEC = 0o001            # execute/search by other
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000165
166#---------------------------------------------------------
Guido van Rossumd8faa362007-04-27 19:54:29 +0000167# initialization
168#---------------------------------------------------------
# "nt" and "ce" platforms always get UTF-8; everything else uses the
# interpreter's file system encoding.
ENCODING = "utf-8" if os.name in ("nt", "ce") else sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000173
174#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000175# Some useful functions
176#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000177
def stn(s, length, encoding, errors):
    """Encode the string s and fit it into a field of length bytes.

    Encoded text longer than the field is cut off; shorter text is
    padded on the right with NUL bytes.
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, NUL)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
def nts(s, encoding, errors):
    """Decode a null-terminated bytes object to a string.

    Everything from the first null byte onwards is dropped; a value
    without a null byte is decoded in full.
    """
    head, _sep, _rest = s.partition(b"\0")
    return head.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000191
def nti(s):
    """Convert a number field to a python number.

    There are two possible encodings for a number field, see itn():
    a null-terminated string of octal digits, or a leading 0o200 byte
    followed by a big-endian base-256 number.

    Raises InvalidHeaderError if the octal text cannot be parsed.
    """
    # Indexing a bytes object yields an int, so the marker byte has to be
    # compared as a number; comparing against chr(0o200) never matches.
    if s[0] != 0o200:
        try:
            n = int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    else:
        n = 0
        # Skip the marker byte and accumulate the rest, most significant
        # byte first.
        for byte in s[1:]:
            n = (n << 8) + byte
    return n
208
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.

    Returns a field of digits bytes. Raises ValueError if n does not fit
    the octal form and either format is not GNU_FORMAT or n is too large
    for the base-256 form.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 byte indicates this particular
    # encoding, the following digits-1 bytes are a big-endian
    # representation. This allows values up to (256**(digits-1))-1.
    width = digits - 1
    if 0 <= n < 8 ** width:
        return bytes("%0*o" % (width, n), "ascii") + NUL

    if format != GNU_FORMAT or n >= 256 ** width:
        raise ValueError("overflow in number field")

    if n < 0:
        # XXX We mimic GNU tar's behaviour with negative numbers by
        # reinterpreting a C long as an unsigned long; the packing step
        # fails for values that do not fit a C long.
        n = struct.unpack("L", struct.pack("l", n))[0]

    # Collect the low-order bytes first, append the marker, then flip the
    # whole sequence so the marker leads a big-endian number.
    octets = []
    for _ in range(width):
        octets.append(n & 0o377)
        n >>= 8
    octets.append(0o200)
    octets.reverse()
    return bytearray(octets)
235
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.

       Returns the tuple (unsigned_chksum, signed_chksum).
    """
    # The chksum field occupies bytes 148-155.  The "8x" skips it, and the
    # constant 256 stands in for its eight spaces (8 * 0x20).
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000248
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.

       Raises IOError if src runs out of data before length bytes
       have been copied.
    """
    if length is None:
        shutil.copyfileobj(src, dst)
        return
    if length == 0:
        return

    BUFSIZE = 16 * 1024

    def transfer(count):
        # Move one chunk; a short read means src ended too early.
        data = src.read(count)
        if len(data) < count:
            raise IOError("end of file reached")
        dst.write(data)

    whole_chunks, tail = divmod(length, BUFSIZE)
    for _ in range(whole_chunks):
        transfer(BUFSIZE)
    if tail != 0:
        transfer(tail)
    return
273
# One inner tuple per character of the mode string.  Within a tuple the
# (bits, char) pairs are tried in order and the first pair whose bits are
# all set in the mode supplies the character.
filemode_table = (
    ((S_IFLNK, "l"),
     (S_IFREG, "-"),
     (S_IFBLK, "b"),
     (S_IFDIR, "d"),
     (S_IFCHR, "c"),
     (S_IFIFO, "p")),

    ((TUREAD, "r"),),
    ((TUWRITE, "w"),),
    ((TUEXEC|TSUID, "s"),
     (TSUID, "S"),
     (TUEXEC, "x")),

    ((TGREAD, "r"),),
    ((TGWRITE, "w"),),
    ((TGEXEC|TSGID, "s"),
     (TSGID, "S"),
     (TGEXEC, "x")),

    ((TOREAD, "r"),),
    ((TOWRITE, "w"),),
    ((TOEXEC|TSVTX, "t"),
     (TSVTX, "T"),
     (TOEXEC, "x"))
)

def filemode(mode):
    """Convert a file's mode to a string of the form
       -rwxrwxrwx.
       Used by TarFile.list()
    """
    # For every position take the first matching character, or "-" when
    # none of that position's bit patterns is fully set.
    perm = [
        next((char for bit, char in table if mode & bit == bit), "-")
        for table in filemode_table
    ]
    return "".join(perm)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000315
class TarError(Exception):
    """Base exception for all errors defined by this module."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000349
350#---------------------------
351# internal stream interface
352#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.

       The file is accessed through an os-level file descriptor kept
       in self.fd.
    """

    def __init__(self, name, mode):
        """Open name for reading (mode "r") or writing (mode "w").

        Any other mode raises KeyError.
        """
        mode = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        # O_BINARY is not defined on every platform; add it where it is.
        if hasattr(os, "O_BINARY"):
            mode |= os.O_BINARY
        self.fd = os.open(name, mode, 0o666)

    def close(self):
        """Close the underlying file descriptor."""
        os.close(self.fd)

    def read(self, size):
        """Return at most size bytes read from the descriptor."""
        return os.read(self.fd, size)

    def write(self, s):
        """Write all of the bytes-like object s to the descriptor."""
        # os.write() reports how many bytes it accepted and may take fewer
        # than offered, so keep going until nothing is left.
        remaining = memoryview(s)
        while len(remaining) > 0:
            written = os.write(self.fd, remaining)
            remaining = remaining[written:]
376
class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip or bzip2 compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.

        name     -- file name; opened with _LowLevelFile when fileobj
                    is None
        mode     -- "r" or "w"
        comptype -- "tar", "gz", "bz2", or "*" to detect the type from
                    the first block of fileobj
        fileobj  -- object to read from / write to, or None
        bufsize  -- number of bytes transferred per read/write on fileobj

        Raises CompressionError if the module needed for comptype is
        not available.
        """
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""
        self.pos = 0
        self.closed = False
        # Exception type that the decompressor in use raises on bad input;
        # _read() turns it into ReadError.
        self.exception = IOError

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                self.exception = zlib.error
                if mode == "r":
                    self._init_read_gz()
                else:
                    self._init_write_gz()

            if comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                else:
                    self.cmp = bz2.BZ2Compressor()
        except BaseException:
            # Do not leak a file we opened ourselves; the error is
            # re-raised unchanged.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # "closed" is missing if __init__ failed before setting it.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        # Negative wbits: raw deflate data, the gzip header and trailer
        # are written by this class itself.
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                            -self.zlib.MAX_WBITS,
                                            self.zlib.DEF_MEM_LEVEL,
                                            0)
        timestamp = struct.pack("<L", int(time.time()))
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write the bytes s to the stream, compressing them unless
           comptype is "tar".
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Write the bytes s to the stream if a whole new block
           is ready to be written.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        # Mark as closed first so a failure below does not make __del__
        # attempt the same flush again.
        self.closed = True
        try:
            if self.mode == "w" and self.comptype != "tar":
                self.buf += self.cmp.flush()

            if self.mode == "w" and self.buf:
                self.fileobj.write(self.buf)
                self.buf = b""
                if self.comptype == "gz":
                    # Mask both values to 32 bits so that struct always
                    # receives a non-negative number that fits the field.
                    self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
        finally:
            # A file opened by this object is closed even if flushing
            # failed.
            if not self._extfileobj:
                self.fileobj.close()

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Raises ReadError if the gzip magic is missing and
        CompressionError if the compression method is not deflate.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)

        if flag & 4:
            # The extra field is part of the uncompressed gzip header, so
            # it is skipped on the raw stream.  Going through read() would
            # feed it to the decompressor and advance self.pos.
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.__read(xlen)
        if flag & 8:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.

           Raises StreamError if pos lies before the current position.
        """
        if pos - self.pos >= 0:
            # Moving forward is done by reading and discarding data.
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            # The chunks are bytes, so the separator has to be bytes too.
            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream.

        Raises ReadError if the compressed data cannot be decoded.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception:
                raise ReadError("invalid compressed data")
            self.dbuf += buf
            c += len(buf)
        buf = self.dbuf[:size]
        self.dbuf = self.dbuf[size:]
        return buf

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            self.buf += buf
            c += len(buf)
        buf = self.buf[:size]
        self.buf = self.buf[size:]
        return buf
# class _Stream
608
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        """Wrap fileobj and read its first block for inspection."""
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        """Return the block read in __init__, ignoring size.

        The method then replaces itself on the instance with
        fileobj.read, so every later call goes straight through.
        """
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Return "gz", "bz2" or "tar" depending on the first block."""
        if self.buf.startswith(b"\037\213\010"):
            return "gz"
        # A bzip2 stream starts with "BZh" and one digit for the block
        # size (1-9), followed by the block magic "1AY&SY".  Skipping the
        # digit makes this match every compression level, not only 9.
        if self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            return "bz2"
        return "tar"

    def close(self):
        """Close the wrapped file object."""
        self.fileobj.close()
# class StreamProxy
632
class _BZ2Proxy(object):
    """Small proxy class that enables external file object
       support for "r:bz2" and "w:bz2" modes. This is actually
       a workaround for a limitation in bz2 module's BZ2File
       class which (unlike gzip.GzipFile) has no support for
       a file object argument.
    """

    # Number of compressed bytes requested from fileobj per read.
    blocksize = 16 * 1024

    def __init__(self, fileobj, mode):
        """Wrap fileobj for reading (mode "r") or writing."""
        self.fileobj = fileobj
        self.mode = mode
        self.name = getattr(self.fileobj, "name", None)
        self.init()

    def init(self):
        """(Re)start the stream at position 0 with a fresh bz2 object."""
        import bz2
        self.pos = 0
        if self.mode != "r":
            self.bz2obj = bz2.BZ2Compressor()
            return
        self.bz2obj = bz2.BZ2Decompressor()
        self.fileobj.seek(0)
        self.buf = b""

    def read(self, size):
        """Return up to size decompressed bytes."""
        # Decompress further blocks until enough data is buffered or the
        # underlying file is exhausted.
        while len(self.buf) < size:
            raw = self.fileobj.read(self.blocksize)
            if not raw:
                break
            self.buf += self.bz2obj.decompress(raw)

        chunk, self.buf = self.buf[:size], self.buf[size:]
        self.pos += len(chunk)
        return chunk

    def seek(self, pos):
        """Move to the decompressed position pos."""
        # Going backwards means starting over from the beginning; the
        # remaining distance is covered by reading and discarding.
        if pos < self.pos:
            self.init()
        self.read(pos - self.pos)

    def tell(self):
        """Return the current uncompressed position."""
        return self.pos

    def write(self, data):
        """Compress data and hand the result to fileobj."""
        self.pos += len(data)
        self.fileobj.write(self.bz2obj.compress(data))

    def close(self):
        """Flush pending compressed data in write mode.

        The wrapped file object is not closed here.
        """
        if self.mode == "w":
            self.fileobj.write(self.bz2obj.flush())
# class _BZ2Proxy
692
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000693#------------------------
694# Extraction file object
695#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        """Expose size bytes of fileobj, stored from offset onwards.

        blockinfo is an optional sequence of (offset, size) pairs naming
        the ranges of the exposed file that are really stored; everything
        in between is read back as NUL bytes.  Without it, the data is one
        contiguous range.
        """
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.  Entries are
        # (is_data, start, stop, position_in_fileobj) tuples.
        self.map_index = 0
        self.map = []
        covered = 0
        stored_at = self.offset
        for block_start, block_len in blockinfo:
            block_stop = block_start + block_len
            if block_start > covered:
                self.map.append((False, covered, block_start, None))
            self.map.append((True, block_start, block_stop, stored_at))
            stored_at += block_len
            covered = block_stop
        if covered < self.size:
            self.map.append((False, covered, self.size, None))

    def seekable(self):
        """Report whether the wrapped file object is seekable."""
        if hasattr(self.fileobj, "seekable"):
            return self.fileobj.seekable()
        # XXX gzip.GzipFile and bz2.BZ2File
        return True

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position):
        """Seek to a position in the file.
        """
        self.position = position

    def read(self, size=None):
        """Read data from the file.

        At most size bytes are returned, and never more than what is
        left up to the end of the file; with size None everything up
        to the end is returned.
        """
        available = self.size - self.position
        size = available if size is None else min(size, available)

        pieces = []
        while size > 0:
            # Find the map entry holding the current position, continuing
            # from the entry used last and wrapping around at the end.
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                self.map_index += 1
                if self.map_index == len(self.map):
                    self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                self.fileobj.seek(offset + (self.position - start))
                pieces.append(self.fileobj.read(length))
            else:
                pieces.append(NUL * length)
            size -= length
            self.position += length
        return b"".join(pieces)
#class _FileInFile
769
770
class ExFileObject(object):
    """File-like object for reading an archive member.
       Is returned by TarFile.extractfile().

       All reads go through `self.fileobj`; readline() additionally keeps
       bytes it has fetched but not yet returned in `self.buffer`.
       `self.position` is the logical read position reported by tell().
    """
    # Number of bytes readline() requests from the underlying file at once.
    blocksize = 1024

    def __init__(self, tarfile, tarinfo):
        """Wrap the data area of member `tarinfo` inside `tarfile`."""
        self.fileobj = _FileInFile(tarfile.fileobj,
                                   tarinfo.offset_data,
                                   tarinfo.size,
                                   tarinfo.sparse)
        self.name = tarinfo.name
        self.mode = "r"
        self.closed = False
        self.size = tarinfo.size

        self.position = 0
        self.buffer = b""

    def readable(self):
        """Return True: the object can always be read from."""
        return True

    def writable(self):
        """Return False: the object is read-only."""
        return False

    def seekable(self):
        """Return whether the underlying file object is seekable."""
        return self.fileobj.seekable()

    def read(self, size=None):
        """Read at most size bytes from the file. If size is not
           present, None or negative, read all data until EOF is reached.

           Raises ValueError if the file has been closed.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        if size is None or size < 0:
            # Unlimited read: hand out whatever readline() left in the
            # buffer, followed by the rest of the member. A negative
            # size used to be applied as a slice index and silently
            # dropped data.
            buf = self.buffer
            self.buffer = b""
            buf += self.fileobj.read()
        else:
            # Serve buffered bytes first, then top up from the file.
            buf = self.buffer[:size]
            self.buffer = self.buffer[size:]
            buf += self.fileobj.read(size - len(buf))

        self.position += len(buf)
        return buf

    # XXX TextIOWrapper uses the read1() method.
    read1 = read

    def readline(self, size=-1):
        """Read one entire line from the file. If size is present
           and non-negative, return a bytes object with at most that
           size, which may be an incomplete line. A size of None or
           any negative value means "no limit".

           Raises ValueError if the file has been closed.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        pos = self.buffer.find(b"\n") + 1
        if pos == 0:
            # No newline buffered yet: keep pulling blocks until one
            # contains a newline or the underlying file is exhausted.
            while True:
                buf = self.fileobj.read(self.blocksize)
                self.buffer += buf
                if not buf or b"\n" in buf:
                    pos = self.buffer.find(b"\n") + 1
                    if pos == 0:
                        # EOF without newline: return what is left.
                        pos = len(self.buffer)
                    break

        # Only a non-negative size limits the line length. Previously
        # None raised TypeError and negative values other than -1 were
        # used as a (wrong) slice index.
        if size is not None and size >= 0:
            pos = min(size, pos)

        buf = self.buffer[:pos]
        self.buffer = self.buffer[pos:]
        self.position += len(buf)
        return buf

    def readlines(self):
        """Return a list with all remaining lines.
        """
        result = []
        while True:
            line = self.readline()
            if not line:
                break
            result.append(line)
        return result

    def tell(self):
        """Return the current file position.

           Raises ValueError if the file has been closed.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        return self.position

    def seek(self, pos, whence=os.SEEK_SET):
        """Seek to a position in the file and return the new position.

           The target is clamped to the range [0, self.size]. Raises
           ValueError if the file is closed or `whence` is not one of
           os.SEEK_SET, os.SEEK_CUR or os.SEEK_END.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        if whence == os.SEEK_SET:
            self.position = min(max(pos, 0), self.size)
        elif whence == os.SEEK_CUR:
            if pos < 0:
                self.position = max(self.position + pos, 0)
            else:
                self.position = min(self.position + pos, self.size)
        elif whence == os.SEEK_END:
            self.position = max(min(self.size + pos, self.size), 0)
        else:
            raise ValueError("Invalid argument")

        # Buffered look-ahead belongs to the old position; drop it.
        self.buffer = b""
        self.fileobj.seek(self.position)
        return self.position

    def close(self):
        """Close the file object.
        """
        self.closed = True

    def __iter__(self):
        """Get an iterator over the file's lines.
        """
        while True:
            line = self.readline()
            if not line:
                break
            yield line
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000907#class ExFileObject
908
909#------------------
910# Exported Classes
911#------------------
912class TarInfo(object):
913 """Informational class which holds the details about an
914 archive member given by a tar header block.
915 TarInfo objects are returned by TarFile.getmember(),
916 TarFile.getmembers() and TarFile.gettarinfo() and are
917 usually created internally.
918 """
919
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000920 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
921 "chksum", "type", "linkname", "uname", "gname",
922 "devmajor", "devminor",
923 "offset", "offset_data", "pax_headers", "sparse",
924 "tarfile", "_sparse_structs", "_link_target")
925
def __init__(self, name=""):
    """Construct a TarInfo object with default header values.

    `name` is the optional name of the member; every other field
    starts out empty or zero, with mode 0o644 and type REGTYPE.
    """
    # Text fields: member name, link target, owner and group names.
    self.name = name
    self.linkname = self.uname = self.gname = ""

    # Permissions and member type.
    self.mode = 0o644
    self.type = REGTYPE

    # Numeric header fields: user id, group id, file size,
    # modification time and header checksum.
    self.uid = self.gid = 0
    self.size = self.mtime = self.chksum = 0

    # Device major / minor numbers.
    self.devmajor = self.devminor = 0

    # Where the tar header and the file's data start.
    self.offset = self.offset_data = 0

    # Sparse member information and pax header information.
    self.sparse = None
    self.pax_headers = {}
949
# The pax format names the "name" and "linkname" header fields
# "path" and "linkpath"; expose them under those names as well.
def _getpath(self):
    # Read access to `path` simply returns `name`.
    return self.name

def _setpath(self, name):
    # Write access to `path` stores into `name`.
    self.name = name

path = property(fget=_getpath, fset=_setpath)
957
def _getlinkpath(self):
    # Read access to `linkpath` simply returns `linkname`.
    return self.linkname

def _setlinkpath(self, linkname):
    # Write access to `linkpath` stores into `linkname`.
    self.linkname = linkname

linkpath = property(fget=_getlinkpath, fset=_setlinkpath)
963
def __repr__(self):
    """Return "<ClassName 'member name' at 0xADDRESS>"."""
    class_name = self.__class__.__name__
    address = id(self)
    return "<%s %r at %#x>" % (class_name, self.name, address)
966
def get_info(self):
    """Return the TarInfo's attributes as a dictionary.

    The mode is reduced to its lower 12 bits (0o7777) and the name of
    a DIRTYPE member is given a trailing slash if it lacks one.
    """
    member_name = self.name
    if self.type == DIRTYPE and not member_name.endswith("/"):
        member_name += "/"

    return {
        "name":     member_name,
        "mode":     self.mode & 0o7777,
        "uid":      self.uid,
        "gid":      self.gid,
        "size":     self.size,
        "mtime":    self.mtime,
        "chksum":   self.chksum,
        "type":     self.type,
        "linkname": self.linkname,
        "uname":    self.uname,
        "gname":    self.gname,
        "devmajor": self.devmajor,
        "devminor": self.devminor,
    }
990
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
    """Return a tar header for this member as a bytes object.

    `format` selects the header flavour (USTAR_FORMAT, GNU_FORMAT or
    PAX_FORMAT); any other value raises ValueError. `encoding` and
    `errors` are forwarded to the ustar and GNU builders, the pax
    builder only receives `encoding`.
    """
    fields = self.get_info()

    if format == USTAR_FORMAT:
        return self.create_ustar_header(fields, encoding, errors)
    if format == GNU_FORMAT:
        return self.create_gnu_header(fields, encoding, errors)
    if format == PAX_FORMAT:
        return self.create_pax_header(fields, encoding)
    raise ValueError("invalid format")
1004
def create_ustar_header(self, info, encoding, errors):
    """Return the object as a ustar header block.

    `info` is updated in place: "magic" is set, and a name longer
    than LENGTH_NAME is split into "prefix" and "name". Raises
    ValueError if the linkname exceeds LENGTH_LINK or the name cannot
    be split.
    """
    info["magic"] = POSIX_MAGIC

    if len(info["linkname"]) > LENGTH_LINK:
        raise ValueError("linkname is too long")

    full_name = info["name"]
    if len(full_name) > LENGTH_NAME:
        # Too long for the name field alone: move the leading
        # directories into the separate prefix field.
        info["prefix"], info["name"] = self._posix_split_name(full_name)

    return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001017
def create_gnu_header(self, info, encoding, errors):
    """Return the object as a GNU header block sequence.

    A linkname longer than LENGTH_LINK and a name longer than
    LENGTH_NAME each get their own long-name header sequence (in that
    order) placed in front of the regular header block.
    """
    info["magic"] = GNU_MAGIC

    oversized = (
        ("linkname", LENGTH_LINK, GNUTYPE_LONGLINK),
        ("name", LENGTH_NAME, GNUTYPE_LONGNAME),
    )
    pieces = []
    for key, limit, long_type in oversized:
        if len(info[key]) > limit:
            pieces.append(
                self._create_gnu_long_header(info[key], long_type, encoding, errors))

    pieces.append(self._create_header(info, GNU_FORMAT, encoding, errors))
    return b"".join(pieces)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001031
def create_pax_header(self, info, encoding):
    """Return the object as a ustar header block. If it cannot be
       represented this way, prepend a pax extended header sequence
       with supplement information.

       Works on a copy of self.pax_headers; `info` is modified in
       place (magic set, overflowing numeric fields zeroed).
    """
    info["magic"] = POSIX_MAGIC
    pax_headers = self.pax_headers.copy()

    # String fields go into the pax header when they are not pure
    # ASCII or do not fit into their ustar field.
    text_fields = (
        ("name", "path", LENGTH_NAME),
        ("linkname", "linkpath", LENGTH_LINK),
        ("uname", "uname", 32),
        ("gname", "gname", 32),
    )
    for field, keyword, limit in text_fields:
        if keyword in pax_headers:
            # An explicitly supplied pax header wins.
            continue

        text = info[field]
        try:
            text.encode("ascii", "strict")
        except UnicodeEncodeError:
            pax_headers[keyword] = text
        else:
            if len(text) > limit:
                pax_headers[keyword] = text

    # Number fields go into the pax header when they do not fit into
    # their octal ustar field or are floats.
    for field, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
        if field in pax_headers:
            # An explicitly supplied pax header wins; zero the ustar
            # field so that it cannot overflow.
            info[field] = 0
            continue

        number = info[field]
        ceiling = 8 ** (digits - 1)
        if not 0 <= number < ceiling or isinstance(number, float):
            pax_headers[field] = str(number)
            info[field] = 0

    # Emit an extended header only if there is something to put in it.
    if pax_headers:
        extended = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
    else:
        extended = b""

    return extended + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001080
@classmethod
def create_pax_global_header(cls, pax_headers):
    """Return a pax global header block sequence for `pax_headers`.

    The records are always built with XGLTYPE and "utf-8".
    """
    header_type = XGLTYPE
    charset = "utf-8"
    return cls._create_pax_generic_header(pax_headers, header_type, charset)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001086
def _posix_split_name(self, name):
    """Split a name that is too long for the name field into a
       (prefix, name) pair at a "/" separator.

       The split happens at the last "/" within the first
       LENGTH_PREFIX + 1 characters. Raises ValueError if that yields
       an empty prefix or a remainder longer than LENGTH_NAME.
    """
    # Position of the last usable separator, or -1 if there is none.
    cut = name.rfind("/", 0, LENGTH_PREFIX + 1)
    if cut == -1:
        head, tail = "", name
    else:
        head, tail = name[:cut], name[cut + 1:]

    if not head or len(tail) > LENGTH_NAME:
        raise ValueError("name is too long")
    return head, tail
1101
@staticmethod
def _create_header(info, format, encoding, errors):
    """Return a header block. info is a dictionary with file
       information, format must be one of the *_FORMAT constants.

       Missing keys in `info` fall back to empty/zero defaults
       (type REGTYPE, magic POSIX_MAGIC). String fields are encoded
       with `encoding`/`errors`. The block is built with a blank
       checksum field, the checksum is computed over it and then
       written into the field.
    """
    parts = [
        stn(info.get("name", ""), 100, encoding, errors),
        itn(info.get("mode", 0) & 0o7777, 8, format),
        itn(info.get("uid", 0), 8, format),
        itn(info.get("gid", 0), 8, format),
        itn(info.get("size", 0), 12, format),
        itn(info.get("mtime", 0), 12, format),
        # Checksum placeholder. The field is 8 bytes wide: the fields
        # above add up to 148 bytes and frombuf() reads the checksum
        # from buf[148:156] and the type from buf[156:157]. A shorter
        # placeholder would shift every following field.
        b" " * 8,
        info.get("type", REGTYPE),
        stn(info.get("linkname", ""), 100, encoding, errors),
        info.get("magic", POSIX_MAGIC),
        stn(info.get("uname", ""), 32, encoding, errors),
        stn(info.get("gname", ""), 32, encoding, errors),
        itn(info.get("devmajor", 0), 8, format),
        itn(info.get("devminor", 0), 8, format),
        stn(info.get("prefix", ""), 155, encoding, errors)
    ]

    # struct.pack() pads the joined fields with NUL bytes up to
    # BLOCKSIZE.
    buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
    chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
    # Overwrite the first 7 bytes of the checksum field with six octal
    # digits plus NUL; the 8th byte keeps its placeholder space.
    buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
    return buf
1129
@staticmethod
def _create_payload(payload):
    """Return `payload` padded with NUL bytes up to the next
       multiple of BLOCKSIZE (unchanged if already aligned).
    """
    overhang = len(payload) % BLOCKSIZE
    if overhang > 0:
        payload += NUL * (BLOCKSIZE - overhang)
    return payload
1139
@classmethod
def _create_gnu_long_header(cls, name, type, encoding, errors):
    """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
       for name.

       The sequence is a header block named "././@LongLink" of the
       given `type`, followed by the NUL-terminated encoded name
       padded to whole blocks.
    """
    encoded = name.encode(encoding, errors) + NUL

    info = {
        "name": "././@LongLink",
        "type": type,
        "size": len(encoded),
        "magic": GNU_MAGIC,
    }

    # Extended header block first, then the block(s) holding the name.
    header = cls._create_header(info, USTAR_FORMAT, encoding, errors)
    return header + cls._create_payload(encoded)
1156
@classmethod
def _create_pax_generic_header(cls, pax_headers, type, encoding):
    """Return a POSIX.1-2008 extended or global header sequence
       that contains a list of keyword, value pairs. The values
       must be strings.

       If any value cannot be encoded as strict UTF-8, a
       "hdrcharset=BINARY" record is emitted first and all values are
       encoded with `encoding` and the "surrogateescape" handler
       instead of UTF-8.
    """
    # A value that is not valid UTF-8 (e.g. one containing surrogate
    # characters) forces hdrcharset=BINARY, see _proc_pax().
    binary = False
    for candidate in pax_headers.values():
        try:
            candidate.encode("utf-8", "strict")
        except UnicodeEncodeError:
            binary = True
            break

    chunks = []
    if binary:
        # The hdrcharset record goes at the beginning of the header.
        chunks.append(b"21 hdrcharset=BINARY\n")

    for keyword, value in pax_headers.items():
        key_bytes = keyword.encode("utf-8")
        if binary:
            # Try to restore the original byte representation of
            # `value'; this only works if `encoding' matches it.
            value_bytes = value.encode(encoding, "surrogateescape")
        else:
            value_bytes = value.encode("utf-8")

        # A record is "<length> <keyword>=<value>\n" where <length>
        # counts the whole record including its own digits, so iterate
        # until the total no longer changes.
        fixed = len(key_bytes) + len(value_bytes) + 3   # ' ' + '=' + '\n'
        total = 0
        while True:
            attempt = fixed + len(str(total))
            if attempt == total:
                break
            total = attempt
        chunks.append(bytes(str(total), "ascii") + b" " + key_bytes
                      + b"=" + value_bytes + b"\n")

    records = b"".join(chunks)

    # We use a hardcoded "././@PaxHeader" name like star does
    # instead of the one that POSIX recommends.
    info = {
        "name": "././@PaxHeader",
        "type": type,
        "size": len(records),
        "magic": POSIX_MAGIC,
    }

    # Pax header block followed by the padded record blocks.
    header = cls._create_header(info, USTAR_FORMAT, "ascii", "replace")
    return header + cls._create_payload(records)
1207
@classmethod
def frombuf(cls, buf, encoding, errors):
    """Construct a TarInfo object from a 512 byte bytes object.

    `encoding` and `errors` are passed to nts() for the string
    fields. Raises EmptyHeaderError for an empty buffer,
    TruncatedHeaderError if the length is not BLOCKSIZE,
    EOFHeaderError if the block consists only of NUL bytes and
    InvalidHeaderError if the stored checksum matches none of the
    values returned by calc_chksums().
    """
    if len(buf) == 0:
        raise EmptyHeaderError("empty header")
    if len(buf) != BLOCKSIZE:
        raise TruncatedHeaderError("truncated header")
    if buf.count(NUL) == BLOCKSIZE:
        raise EOFHeaderError("end of file header")

    chksum = nti(buf[148:156])
    if chksum not in calc_chksums(buf):
        raise InvalidHeaderError("bad checksum")

    # Decode the fixed-width header fields. Bytes 257:265 are not
    # read here (presumably the magic/version field - confirm against
    # _create_header()).
    obj = cls()
    obj.name = nts(buf[0:100], encoding, errors)
    obj.mode = nti(buf[100:108])
    obj.uid = nti(buf[108:116])
    obj.gid = nti(buf[116:124])
    obj.size = nti(buf[124:136])
    obj.mtime = nti(buf[136:148])
    obj.chksum = chksum
    obj.type = buf[156:157]
    obj.linkname = nts(buf[157:257], encoding, errors)
    obj.uname = nts(buf[265:297], encoding, errors)
    obj.gname = nts(buf[297:329], encoding, errors)
    obj.devmajor = nti(buf[329:337])
    obj.devminor = nti(buf[337:345])
    prefix = nts(buf[345:500], encoding, errors)

    # Old V7 tar format represents a directory as a regular
    # file with a trailing slash.
    if obj.type == AREGTYPE and obj.name.endswith("/"):
        obj.type = DIRTYPE

    # The old GNU sparse format occupies some of the unused
    # space in the buffer for up to 4 sparse structures.
    # Save them for later processing in _proc_sparse().
    if obj.type == GNUTYPE_SPARSE:
        pos = 386
        structs = []
        for i in range(4):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                # Stop at the first entry that nti() cannot parse.
                break
            structs.append((offset, numbytes))
            pos += 24
        # A non-zero byte at 482 means more sparse entries follow in
        # extended blocks; 483:495 holds the member's original size.
        isextended = bool(buf[482])
        origsize = nti(buf[483:495])
        obj._sparse_structs = (structs, isextended, origsize)

    # Remove redundant slashes from directories.
    if obj.isdir():
        obj.name = obj.name.rstrip("/")

    # Reconstruct a ustar longname.
    if prefix and obj.type not in GNU_TYPES:
        obj.name = prefix + "/" + obj.name
    return obj
1270
@classmethod
def fromtarfile(cls, tarfile):
    """Return the next TarInfo object from TarFile object
       tarfile.

       Reads one BLOCKSIZE header block from tarfile.fileobj, parses
       it with frombuf(), records where the block started and lets
       _proc_member() finish the member.
    """
    stream = tarfile.fileobj
    header_block = stream.read(BLOCKSIZE)
    member = cls.frombuf(header_block, tarfile.encoding, tarfile.errors)
    # The header started one block before the current position.
    member.offset = stream.tell() - BLOCKSIZE
    return member._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001280
Guido van Rossumd8faa362007-04-27 19:54:29 +00001281 #--------------------------------------------------------------------------
1282 # The following are methods that are called depending on the type of a
1283 # member. The entry point is _proc_member() which can be overridden in a
1284 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1285 # implement the following
1286 # operations:
1287 # 1. Set self.offset_data to the position where the data blocks begin,
1288 # if there is data that follows.
1289 # 2. Set tarfile.offset to the position where the next member's header will
1290 # begin.
1291 # 3. Return self or another valid TarInfo object.
def _proc_member(self, tarfile):
    """Choose the right processing method depending on
       the type and call it.

       GNU long name/link members, GNU sparse members and pax
       headers have dedicated handlers; everything else is handled
       by _proc_builtin().
    """
    if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
        handler = self._proc_gnulong
    elif self.type == GNUTYPE_SPARSE:
        handler = self._proc_sparse
    elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
        handler = self._proc_pax
    else:
        handler = self._proc_builtin
    return handler(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001304
def _proc_builtin(self, tarfile):
    """Process a builtin type or an unknown type which
       will be treated as a regular file.

       Sets self.offset_data and tarfile.offset, applies the
       TarFile's saved pax headers to this member and returns self.
    """
    data_start = tarfile.fileobj.tell()
    self.offset_data = data_start

    # Only regular files and unsupported types are followed by data
    # blocks that have to be skipped.
    next_header = data_start
    if self.isreg() or self.type not in SUPPORTED_TYPES:
        next_header += self._block(self.size)
    tarfile.offset = next_header

    # Patch the TarInfo object with saved global header information.
    self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

    return self
1321
def _proc_gnulong(self, tarfile):
    """Process the blocks that hold a GNU longname
       or longlink member.

       Returns the TarInfo built from the header that follows, with
       its name (or linkname) replaced by the long value and its
       offset set to this member's offset. Raises
       SubsequentHeaderError if that following header is bad.
    """
    long_value = tarfile.fileobj.read(self._block(self.size))

    # The long name/link applies to the member described by the
    # header that comes next.
    try:
        member = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # The combined member starts where this long-name header starts.
    member.offset = self.offset

    member_type = self.type
    if member_type == GNUTYPE_LONGNAME:
        member.name = nts(long_value, tarfile.encoding, tarfile.errors)
    elif member_type == GNUTYPE_LONGLINK:
        member.linkname = nts(long_value, tarfile.encoding, tarfile.errors)

    return member
1343
def _proc_sparse(self, tarfile):
    """Process a GNU sparse header plus extra headers.

       Completes the (offset, numbytes) list started in frombuf()
       with entries from any extended blocks, stores it in
       self.sparse, sets the data/next-header offsets and replaces
       self.size with the original file size.
    """
    # frombuf() stashed the entries found in the header block itself.
    entries, more_blocks, real_size = self._sparse_structs
    del self._sparse_structs

    stream = tarfile.fileobj
    # Each extended block holds up to 21 entries of 2 x 12 bytes and
    # a continuation flag at byte 504.
    while more_blocks:
        block = stream.read(BLOCKSIZE)
        for index in range(21):
            base = index * 24
            try:
                offset = nti(block[base:base + 12])
                numbytes = nti(block[base + 12:base + 24])
            except ValueError:
                break
            if offset and numbytes:
                entries.append((offset, numbytes))
        more_blocks = bool(block[504])
    self.sparse = entries

    self.offset_data = stream.tell()
    # The next header offset is computed from the size stored in the
    # header, before self.size is replaced by the original size.
    tarfile.offset = self.offset_data + self._block(self.size)
    self.size = real_size
    return self
1371
def _proc_pax(self, tarfile):
    """Process an extended or global header as described in
       POSIX.1-2008.

       Parses the header's records into a pax header dictionary
       (tarfile.pax_headers itself for a global header, a copy of it
       otherwise), reads the following member header, applies GNU
       sparse and extended header information to that member and
       returns it.

       Raises InvalidHeaderError for a record with a length of zero
       and SubsequentHeaderError if the following header is missing
       or bad.
    """
    # Read the header information.
    buf = tarfile.fileobj.read(self._block(self.size))

    # A pax header stores supplemental information for either
    # the following file (extended) or all following files
    # (global).
    if self.type == XGLTYPE:
        pax_headers = tarfile.pax_headers
    else:
        pax_headers = tarfile.pax_headers.copy()

    # Check if the pax header contains a hdrcharset field. This tells us
    # the encoding of the path, linkpath, uname and gname fields. Normally,
    # these fields are UTF-8 encoded but since POSIX.1-2008 tar
    # implementations are allowed to store them as raw binary strings if
    # the translation to UTF-8 fails.
    match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
    if match is not None:
        pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

    # For the time being, we don't care about anything other than "BINARY".
    # The only other value that is currently allowed by the standard is
    # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
    hdrcharset = pax_headers.get("hdrcharset")
    if hdrcharset == "BINARY":
        encoding = tarfile.encoding
    else:
        encoding = "utf-8"

    # Parse pax header information. A record looks like that:
    # "%d %s=%s\n" % (length, keyword, value). length is the size
    # of the complete record including the length field itself and
    # the newline. keyword and value are both UTF-8 encoded strings.
    regex = re.compile(br"(\d+) ([^=]+)=")
    pos = 0
    while True:
        match = regex.match(buf, pos)
        if not match:
            break

        length, keyword = match.groups()
        length = int(length)
        if length == 0:
            # A zero-length record would never advance `pos' and the
            # loop would spin forever on a crafted archive.
            raise InvalidHeaderError("invalid header")
        value = buf[match.end(2) + 1:match.start(1) + length - 1]

        # Normally, we could just use "utf-8" as the encoding and "strict"
        # as the error handler, but we better not take the risk. For
        # example, GNU tar <= 1.23 is known to store filenames it cannot
        # translate to UTF-8 as raw strings (unfortunately without a
        # hdrcharset=BINARY header).
        # We first try the strict standard encoding, and if that fails we
        # fall back on the user's encoding and error handler.
        keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                tarfile.errors)
        if keyword in PAX_NAME_FIELDS:
            value = self._decode_pax_field(value, encoding, tarfile.encoding,
                    tarfile.errors)
        else:
            value = self._decode_pax_field(value, "utf-8", "utf-8",
                    tarfile.errors)

        pax_headers[keyword] = value
        pos += length

    # Fetch the next header.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Process GNU sparse information.
    if "GNU.sparse.map" in pax_headers:
        # GNU extended sparse format version 0.1.
        self._proc_gnusparse_01(next, pax_headers)

    elif "GNU.sparse.size" in pax_headers:
        # GNU extended sparse format version 0.0.
        self._proc_gnusparse_00(next, pax_headers, buf)

    elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
        # GNU extended sparse format version 1.0.
        self._proc_gnusparse_10(next, pax_headers, tarfile)

    if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
        # Patch the TarInfo object with the extended header info.
        next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
        next.offset = self.offset

        if "size" in pax_headers:
            # If the extended header replaces the size field,
            # we need to recalculate the offset where the next
            # header starts.
            offset = next.offset_data
            if next.isreg() or next.type not in SUPPORTED_TYPES:
                offset += next._block(next.size)
            tarfile.offset = offset

    return next
1473
def _proc_gnusparse_00(self, next, pax_headers, buf):
    """Process a GNU tar extended sparse header, version 0.0.

    The raw pax record buffer `buf' is scanned for every
    GNU.sparse.offset and GNU.sparse.numbytes record. The values are
    paired up in order of appearance and stored on `next.sparse' as a
    list of (offset, numbytes) tuples. `pax_headers' is not consulted.
    """
    starts = [int(found.group(1)) for found in
              re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf)]
    lengths = [int(found.group(1)) for found in
               re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf)]
    next.sparse = list(zip(starts, lengths))
1484
def _proc_gnusparse_01(self, next, pax_headers):
    """Process a GNU tar extended sparse header, version 0.1.

    The "GNU.sparse.map" record is a comma separated list of integers
    that alternates between offset and size. It is stored on
    `next.sparse' as a list of (offset, size) tuples.
    """
    numbers = []
    for item in pax_headers["GNU.sparse.map"].split(","):
        numbers.append(int(item))
    offsets = numbers[0::2]
    sizes = numbers[1::2]
    next.sparse = list(zip(offsets, sizes))
1490
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

    The sparse map is read from the archive's file object as
    newline-terminated decimal numbers: first the number of map
    entries, then an offset and a size for every entry. More data is
    read in BLOCKSIZE chunks whenever the buffer holds no complete
    number. Afterwards `next.offset_data' is set to the current file
    position and `next.sparse' to the list of (offset, size) tuples.
    """
    source = tarfile.fileobj
    head, pending = source.read(BLOCKSIZE).split(b"\n", 1)
    wanted = int(head) * 2      # one offset and one size per entry
    numbers = []
    while len(numbers) < wanted:
        if b"\n" not in pending:
            pending += source.read(BLOCKSIZE)
        token, pending = pending.split(b"\n", 1)
        numbers.append(int(token))
    next.offset_data = source.tell()
    next.sparse = list(zip(numbers[::2], numbers[1::2]))
1506
def _apply_pax_info(self, pax_headers, encoding, errors):
    """Replace fields with supplemental information from a previous
       pax extended or global header.

       GNU.sparse.name overrides the path, GNU.sparse.size and
       GNU.sparse.realsize override the size. Every other keyword
       is only applied if it is listed in PAX_FIELDS. Numeric fields
       that cannot be converted are set to 0. A copy of `pax_headers'
       is kept in self.pax_headers.
    """
    for key, field in pax_headers.items():
        if key == "GNU.sparse.name":
            self.path = field
            continue
        if key == "GNU.sparse.size" or key == "GNU.sparse.realsize":
            self.size = int(field)
            continue
        if key not in PAX_FIELDS:
            # Unknown keywords are kept in pax_headers only.
            continue

        if key in PAX_NUMBER_FIELDS:
            convert = PAX_NUMBER_FIELDS[key]
            try:
                field = convert(field)
            except ValueError:
                field = 0
        if key == "path":
            field = field.rstrip("/")
        setattr(self, key, field)

    self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001529
def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
    """Decode a single field from a pax record.

    `value' is first decoded strictly with `encoding'. If that
    fails, it is decoded with `fallback_encoding' using the
    `fallback_errors' error handler instead.
    """
    try:
        text = value.decode(encoding, "strict")
    except UnicodeDecodeError:
        text = value.decode(fallback_encoding, fallback_errors)
    return text
1537
def _block(self, count):
    """Round up a byte count by BLOCKSIZE and return it,
       e.g. _block(834) => 1024.
    """
    whole, rest = divmod(count, BLOCKSIZE)
    # A partially filled block still occupies a full block.
    if rest:
        whole = whole + 1
    return whole * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001546
def isreg(self):
    """Return True if the member's type is one of REGULAR_TYPES."""
    return self.type in REGULAR_TYPES
def isfile(self):
    """Return the result of isreg(); kept as an alternative name."""
    return self.isreg()
def isdir(self):
    """Return True if the member's type is DIRTYPE."""
    return self.type == DIRTYPE
def issym(self):
    """Return True if the member's type is SYMTYPE."""
    return self.type == SYMTYPE
def islnk(self):
    """Return True if the member's type is LNKTYPE."""
    return self.type == LNKTYPE
def ischr(self):
    """Return True if the member's type is CHRTYPE."""
    return self.type == CHRTYPE
def isblk(self):
    """Return True if the member's type is BLKTYPE."""
    return self.type == BLKTYPE
def isfifo(self):
    """Return True if the member's type is FIFOTYPE."""
    return self.type == FIFOTYPE
def issparse(self):
    """Return True if sparse information has been set on the member,
       i.e. self.sparse is not None.
    """
    return self.sparse is not None
def isdev(self):
    """Return True if the member's type is CHRTYPE, BLKTYPE or
       FIFOTYPE.
    """
    return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1567# class TarInfo
1568
1569class TarFile(object):
1570 """The TarFile Class provides an interface to tar archives.
1571 """
1572
# Class-level defaults. __init__() replaces format, tarinfo, dereference,
# ignore_zeros, encoding, debug and errorlevel on the instance only if
# the corresponding argument is not None; errors is always overwritten
# with the `errors' argument.

debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

dereference = False         # If true, add content of linked file to the
                            # tar file, else the link.

ignore_zeros = False        # If true, skips empty or invalid blocks and
                            # continues processing.

errorlevel = 1              # If 0, fatal errors only appear in debug
                            # messages (if debug >= 0). If > 0, errors
                            # are passed to the caller as exceptions.

format = DEFAULT_FORMAT     # The format to use when creating an archive.

encoding = ENCODING         # Encoding for 8-bit character strings.

errors = None               # Error handler for unicode conversion.

tarinfo = TarInfo           # The default TarInfo class to use.

fileobject = ExFileObject   # The default ExFileObject class to use.
1594
def __init__(self, name=None, mode="r", fileobj=None, format=None,
        tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
        errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
    """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
       read from an existing archive, 'a' to append data to an existing
       file or 'w' to create a new file overwriting an existing one. `mode'
       defaults to 'r'.
       If `fileobj' is given, it is used for reading or writing data. If it
       can be determined, `mode' is overridden by `fileobj's mode.
       `fileobj' is not closed, when TarFile is closed.

       The remaining arguments override the class-level defaults of the
       same name when they are not None. `pax_headers' is only used if
       the resulting format is PAX_FORMAT.

       Raises ValueError for any `mode' other than 'r', 'a' or 'w' and
       ReadError if an archive opened for appending has a bad header.
    """
    # Validate by exact membership. A substring test against "raw"
    # would let the empty string through and fail later with KeyError.
    modes = {"r": "rb", "a": "r+b", "w": "wb"}
    if mode not in modes:
        raise ValueError("mode must be 'r', 'a' or 'w'")
    self.mode = mode
    self._mode = modes[mode]

    if not fileobj:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        self._extfileobj = False
    else:
        if name is None and hasattr(fileobj, "name"):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Init attributes.
    if format is not None:
        self.format = format
    if tarinfo is not None:
        self.tarinfo = tarinfo
    if dereference is not None:
        self.dereference = dereference
    if ignore_zeros is not None:
        self.ignore_zeros = ignore_zeros
    if encoding is not None:
        self.encoding = encoding
    self.errors = errors

    if pax_headers is not None and self.format == PAX_FORMAT:
        self.pax_headers = pax_headers
    else:
        self.pax_headers = {}

    if debug is not None:
        self.debug = debug
    if errorlevel is not None:
        self.errorlevel = errorlevel

    # Init datastructures.
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    tarinfo = self.tarinfo.fromtarfile(self)
                    self.members.append(tarinfo)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e))

        if self.mode in ("a", "w"):
            self._loaded = True

            if self.pax_headers:
                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    except:
        # Deliberately catches everything: release a file we opened
        # ourselves, then let the original exception propagate.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001690
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001691 #--------------------------------------------------------------------------
1692 # Below are the classmethods which act as alternate constructors to the
1693 # TarFile class. The open() method is the only one that is needed for
1694 # public use; it is the "super"-constructor and is able to select an
1695 # adequate "sub"-constructor for a particular compression using the mapping
1696 # from OPEN_METH.
1697 #
1698 # This concept allows one to subclass TarFile without losing the comfort of
1699 # the super-constructor. A sub-constructor is registered and made available
1700 # by adding it to the mapping in OPEN_METH.
1701
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
       an appropriate TarFile class.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing

       Raises ValueError if neither `name' nor `fileobj' is given or if
       `mode' is not one of the forms above, CompressionError for an
       unknown compression type and ReadError if no registered method
       can read the archive.
    """

    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Find out which *open() is appropriate for opening the file.
        for comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
            if fileobj is not None:
                saved_pos = fileobj.tell()
            try:
                return func(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Rewind so that the next method sees the same data.
                if fileobj is not None:
                    fileobj.seek(saved_pos)
                continue
        raise ReadError("file could not be opened successfully")

    elif ":" in mode:
        filemode, comptype = mode.split(":", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
        else:
            raise CompressionError("unknown compression type %r" % comptype)
        return func(name, filemode, fileobj, **kwargs)

    elif "|" in mode:
        filemode, comptype = mode.split("|", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Exact membership: a substring test against "rw" would
        # accept the invalid file mode "rw" itself.
        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            t = cls(name, filemode, stream, **kwargs)
        except:
            # Do not leak the stream if the constructor fails.
            stream.close()
            raise
        t._extfileobj = False
        return t

    elif mode in ("a", "w"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001777
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.

       `mode' must be exactly 'r', 'a' or 'w', otherwise ValueError is
       raised. All arguments are passed on to the class constructor.
    """
    # Exact membership: a substring test against "raw" would accept
    # the empty string.
    if mode not in ("r", "a", "w"):
        raise ValueError("mode must be 'r', 'a' or 'w'")
    return cls(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001785
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError if `mode' is not 'r' or 'w', CompressionError
       if the gzip module is not available and ReadError if the data
       cannot be read as gzip. I/O errors while writing are passed on
       unchanged.
    """
    # Exact membership: a substring test against "rw" would accept
    # the empty string.
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    extfileobj = fileobj is not None
    try:
        fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except IOError:
        if not extfileobj and fileobj is not None:
            fileobj.close()
        # fileobj is still None if GzipFile could not open `name'.
        # Only a failed read means "not a gzip file"; a failed
        # write must not be reported as a ReadError.
        if fileobj is None or mode != "r":
            raise
        raise ReadError("not a gzip file")
    except:
        if not extfileobj and fileobj is not None:
            fileobj.close()
        raise
    t._extfileobj = extfileobj
    return t
1816
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError if `mode' is not 'r' or 'w', CompressionError
       if the bz2 module is not available and ReadError if the data
       cannot be read as bzip2. I/O errors while writing are passed on
       unchanged.
    """
    # Exact membership: a substring test against "rw" would accept
    # the empty string.
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    if fileobj is not None:
        fileobj = _BZ2Proxy(fileobj, mode)
    else:
        fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (IOError, EOFError):
        fileobj.close()
        # Only a failed read means "not a bzip2 file".
        if mode != "r":
            raise
        raise ReadError("not a bzip2 file")
    except:
        # Do not leak the compressed file object on other errors.
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1842
# Registry of all *open() methods: compression type -> method name.
OPEN_METH = dict(
    tar="taropen",      # uncompressed tar
    gz="gzopen",        # gzip compressed tar
    bz2="bz2open",      # bzip2 compressed tar
)
1849
1850 #--------------------------------------------------------------------------
1851 # The public methods which TarFile provides:
1852
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive.

       Calling close() on an already closed TarFile does nothing. The
       underlying file object is closed unless it was supplied by the
       caller, even if writing the final blocks fails.
    """
    if self.closed:
        return

    # Mark as closed first so that a failure below cannot leave the
    # object half-closed and trigger a second finalisation attempt.
    self.closed = True
    try:
        if self.mode in ("a", "w"):
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
    finally:
        if not self._extfileobj:
            self.fileobj.close()
1872
def getmember(self, name):
    """Return a TarInfo object for member `name'. KeyError is raised
       if `name' can not be found in the archive. If a member occurs
       more than once in the archive, its last occurrence is assumed
       to be the most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001883
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects. The
       list has the same order as the members in the archive.
    """
    self._check()
    # The whole archive has to be scanned once before the
    # list of members is complete.
    already_scanned = self._loaded
    if not already_scanned:
        self._load()
    return self.members
1893
def getnames(self):
    """Return the members of the archive as a list of their names. It has
       the same order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001899
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object for either the file `name' or the file
       object `fileobj' (using os.fstat on its file descriptor). You can
       modify some of the TarInfo's attributes before you add it using
       addfile(). If given, `arcname' specifies an alternative name for the
       file in the archive.

       None is returned if the file is of a type that is not handled
       here (not a regular file, directory, fifo, symbolic link,
       character device or block device).
    """
    self._check("aw")

    # A file object always wins over `name'.
    if fileobj is not None:
        name = fileobj.name

    # Build the member name: drop a drive prefix, use forward
    # slashes only and make absolute paths relative.
    if arcname is None:
        arcname = name
    drv, arcname = os.path.splitdrive(arcname)
    arcname = arcname.replace(os.sep, "/").lstrip("/")

    # The TarInfo object that is going to be filled in.
    tarinfo = self.tarinfo()
    tarinfo.tarfile = self

    # Symbolic links are only followed if the platform has no
    # lstat() or if dereference is set.
    if fileobj is not None:
        st = os.fstat(fileobj.fileno())
    elif hasattr(os, "lstat") and not self.dereference:
        st = os.lstat(name)
    else:
        st = os.stat(name)
    linkname = ""

    mode_bits = st.st_mode
    if stat.S_ISREG(mode_bits):
        inode = (st.st_ino, st.st_dev)
        archived_before = (not self.dereference and st.st_nlink > 1 and
                           inode in self.inodes and
                           arcname != self.inodes[inode])
        if archived_before:
            # A hardlink to a file that is already in the archive.
            kind = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # The inode is remembered only if it is valid.
            # For win32 it is always 0.
            kind = REGTYPE
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(mode_bits):
        kind = DIRTYPE
    elif stat.S_ISFIFO(mode_bits):
        kind = FIFOTYPE
    elif stat.S_ISLNK(mode_bits):
        kind = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(mode_bits):
        kind = CHRTYPE
    elif stat.S_ISBLK(mode_bits):
        kind = BLKTYPE
    else:
        return None

    # Copy everything the stat result offers.
    tarinfo.name = arcname
    tarinfo.mode = mode_bits
    tarinfo.uid = st.st_uid
    tarinfo.gid = st.st_gid
    # Only regular files carry data in the archive.
    tarinfo.size = st.st_size if kind == REGTYPE else 0
    tarinfo.mtime = st.st_mtime
    tarinfo.type = kind
    tarinfo.linkname = linkname
    if pwd:
        try:
            tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
        except KeyError:
            pass

    if kind in (CHRTYPE, BLKTYPE):
        if hasattr(os, "major") and hasattr(os, "minor"):
            tarinfo.devmajor = os.major(st.st_rdev)
            tarinfo.devminor = os.minor(st.st_rdev)
    return tarinfo
1997
def list(self, verbose=True):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced.
    """
    self._check()

    for member in self:
        if verbose:
            print(filemode(member.mode), end=' ')
            owner = "%s/%s" % (member.uname or member.uid,
                               member.gname or member.gid)
            print(owner, end=' ')
            if member.ischr() or member.isblk():
                # Devices show "major,minor" in place of the size.
                device = "%d,%d" % (member.devmajor, member.devminor)
                print("%10s" % device, end=' ')
            else:
                print("%10d" % member.size, end=' ')
            stamp = time.localtime(member.mtime)[:6]
            print("%d-%02d-%02d %02d:%02d:%02d" % stamp, end=' ')

        label = member.name
        if member.isdir():
            label = label + "/"
        print(label, end=' ')

        if verbose:
            if member.issym():
                print("->", member.linkname, end=' ')
            if member.islnk():
                print("link to", member.linkname, end=' ')
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002026
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. `exclude' is a function that should
       return True for each filename to be excluded; it is deprecated in
       favour of `filter'. `filter' is a function that expects a TarInfo
       object argument and returns the changed TarInfo object, if it
       returns None the TarInfo object will be excluded from the archive.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)

    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        f = bltn_open(name, "rb")
        try:
            self.addfile(tarinfo, f)
        finally:
            # Close the source file even if addfile() fails.
            f.close()

    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            for f in os.listdir(name):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                        recursive, exclude, filter=filter)

    else:
        self.addfile(tarinfo)
2088
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, tarinfo.size bytes are read from it and added to the archive.
       You can create TarInfo objects using gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.
    """
    self._check("aw")

    # Work on a copy, the caller's object is left untouched.
    member = copy.copy(tarinfo)

    header = member.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset = self.offset + len(header)

    # Append the data, padded with NULs to a full block.
    if fileobj is not None:
        copyfileobj(fileobj, self.fileobj, member.size)
        full, tail = divmod(member.size, BLOCKSIZE)
        if tail > 0:
            self.fileobj.write(NUL * (BLOCKSIZE - tail))
            full = full + 1
        self.offset = self.offset + full * BLOCKSIZE

    self.members.append(member)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002114
def extractall(self, path=".", members=None):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers().
    """
    pending_dirs = []

    if members is None:
        members = self

    for member in members:
        if member.isdir():
            # Directories are created with a safe mode first; the
            # real attributes are applied in the second pass below.
            pending_dirs.append(member)
            member = copy.copy(member)
            member.mode = 0o700
            self.extract(member, path, set_attrs=False)
        else:
            self.extract(member, path, set_attrs=True)

    # Walk the directories in reverse name order.
    pending_dirs.sort(key=lambda entry: entry.name)

    # Set correct owner, mtime and filemode on directories.
    for entry in reversed(pending_dirs):
        target = os.path.join(path, entry.name)
        try:
            self.chown(entry, target)
            self.utime(entry, target)
            self.chmod(entry, target)
        except ExtractError as exc:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % exc)
2152
def extract(self, member, path="", set_attrs=True):
    """Extract a single member below `path' (the current working
       directory by default), using the member's full name.
       `member' may be a filename or a TarInfo object. File attributes
       (owner, mtime, mode) are set unless `set_attrs' is False.

       Errors are re-raised or only reported through _dbg() depending
       on self.errorlevel.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Prepare the link target for makelink().
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    destination = os.path.join(path, tarinfo.name)
    try:
        self._extract_member(tarinfo, destination, set_attrs=set_attrs)
    except EnvironmentError as e:
        if self.errorlevel > 0:
            raise
        if e.filename is None:
            self._dbg(1, "tarfile: %s" % e.strerror)
        else:
            self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
    except ExtractError as e:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % e)
2187
def extractfile(self, member):
    """Return a read-only file-like object for `member', which may be a
       filename or a TarInfo object.

       Regular files and members of an unsupported type are wrapped in
       self.fileobject. For a (sym)link the file object of the link's
       target is returned. For every other member (directory, chrdev,
       blkdev, etc.) there is no data and None is returned.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # A member of unknown type is treated like a regular file.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    if tarinfo.islnk() or tarinfo.issym():
        if isinstance(self.fileobj, _Stream):
            # A small but ugly workaround for the case that someone tries
            # to extract a (sym)link as a file-object from a non-seekable
            # stream of tar blocks.
            raise StreamError("cannot extract (sym)link as file object")
        # A (sym)link's file object is its target's file object.
        return self.extractfile(self._find_link_target(tarinfo))

    # No data is associated with this kind of member.
    return None
2225
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002226 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002227 """Extract the TarInfo object tarinfo to a physical
2228 file called targetpath.
2229 """
2230 # Fetch the TarInfo object for the given name
2231 # and build the destination pathname, replacing
2232 # forward slashes to platform specific separators.
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +00002233 targetpath = targetpath.rstrip("/")
2234 targetpath = targetpath.replace("/", os.sep)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002235
2236 # Create all upper directories.
2237 upperdirs = os.path.dirname(targetpath)
2238 if upperdirs and not os.path.exists(upperdirs):
Christian Heimes2202f872008-02-06 14:31:34 +00002239 # Create directories that are not part of the archive with
2240 # default permissions.
Thomas Woutersb2137042007-02-01 18:02:27 +00002241 os.makedirs(upperdirs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002242
2243 if tarinfo.islnk() or tarinfo.issym():
2244 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2245 else:
2246 self._dbg(1, tarinfo.name)
2247
2248 if tarinfo.isreg():
2249 self.makefile(tarinfo, targetpath)
2250 elif tarinfo.isdir():
2251 self.makedir(tarinfo, targetpath)
2252 elif tarinfo.isfifo():
2253 self.makefifo(tarinfo, targetpath)
2254 elif tarinfo.ischr() or tarinfo.isblk():
2255 self.makedev(tarinfo, targetpath)
2256 elif tarinfo.islnk() or tarinfo.issym():
2257 self.makelink(tarinfo, targetpath)
2258 elif tarinfo.type not in SUPPORTED_TYPES:
2259 self.makeunknown(tarinfo, targetpath)
2260 else:
2261 self.makefile(tarinfo, targetpath)
2262
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002263 if set_attrs:
2264 self.chown(tarinfo, targetpath)
2265 if not tarinfo.issym():
2266 self.chmod(tarinfo, targetpath)
2267 self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002268
2269 #--------------------------------------------------------------------------
2270 # Below are the different file methods. They are called via
2271 # _extract_member() when extract() is called. They can be replaced in a
2272 # subclass to implement other functionality.
2273
def makedir(self, tarinfo, targetpath):
    """Create the directory `targetpath' with mode 0o700. A directory
       that already exists is not an error; any other failure is
       re-raised.
    """
    try:
        # Only a safe mode is used here; this method never applies
        # tarinfo.mode itself.
        os.mkdir(targetpath, 0o700)
    except EnvironmentError as exc:
        if exc.errno == errno.EEXIST:
            return
        raise
2284
def makefile(self, tarinfo, targetpath):
    """Write the data of `tarinfo' to a new file called targetpath.

       The archive file object is positioned at tarinfo.offset_data and
       the data is copied from there. For a sparse member each
       (offset, size) pair in tarinfo.sparse is written at its offset;
       otherwise tarinfo.size bytes are copied in one go. The file is
       finally truncated at tarinfo.size.
    """
    source = self.fileobj
    source.seek(tarinfo.offset_data)
    # The with block closes the target even if copying or truncating
    # raises, so a failed extraction does not leak the file handle.
    with bltn_open(targetpath, "wb") as target:
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
        else:
            copyfileobj(source, target, tarinfo.size)
        # Position at the logical end so truncate() fixes the final size.
        target.seek(tarinfo.size)
        target.truncate()
2300
def makeunknown(self, tarinfo, targetpath):
    """Extract a member of unknown type to targetpath as if it were a
       regular file and report that through _dbg().
    """
    self.makefile(tarinfo, targetpath)
    message = ("tarfile: Unknown file type %r, "
               "extracted as regular file." % tarinfo.type)
    self._dbg(1, message)
2308
def makefifo(self, tarinfo, targetpath):
    """Create a fifo called targetpath. Raise ExtractError if the os
       module provides no mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002316
def makedev(self, tarinfo, targetpath):
    """Create a character or block device node called targetpath.
       Raise ExtractError if the os module lacks mknod() or makedev().
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the permission bits with the matching device type flag.
    mode = tarinfo.mode | (stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR)
    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, mode, device)
2331
def makelink(self, tarinfo, targetpath):
    """Make a (symbolic) link called targetpath. If it cannot be created
       (platform limitation), we try to make a copy of the referenced file
       instead of a link.

       Raise ExtractError if the copy fallback is needed but the link
       target cannot be found inside the archive.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        else:
            # tarinfo._link_target is prepared by extract().
            if os.path.exists(tarinfo._link_target):
                os.link(tarinfo._link_target, targetpath)
            else:
                # The hard link's target is not on disk; extract the
                # referenced member to targetpath instead.
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
    except symlink_exception:
        # Links are not available: fall back to a copy of the referenced
        # member. This must only run on failure; running it after a
        # successful link creation would extract the target a second
        # time on top of the freshly created link.
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002360
def chown(self, tarinfo, targetpath):
    """Set owner of targetpath according to tarinfo. Nothing is done
       unless the effective user id is 0. Raise ExtractError if the
       owner cannot be changed.
    """
    # We have to be root to do so.
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        return

    # Group: look up by name, then by numeric id, finally fall back to
    # the group of the current process.
    try:
        gid = grp.getgrnam(tarinfo.gname)[2]
    except KeyError:
        try:
            gid = grp.getgrgid(tarinfo.gid)[2]
        except KeyError:
            gid = os.getgid()

    # User: same lookup order as for the group.
    try:
        uid = pwd.getpwnam(tarinfo.uname)[2]
    except KeyError:
        try:
            uid = pwd.getpwuid(tarinfo.uid)[2]
        except KeyError:
            uid = os.getuid()

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, uid, gid)
        elif sys.platform != "os2emx":
            os.chown(targetpath, uid, gid)
    except EnvironmentError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002388
def chmod(self, tarinfo, targetpath):
    """Apply tarinfo.mode to targetpath. Raise ExtractError if the mode
       cannot be changed.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except EnvironmentError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002397
def utime(self, tarinfo, targetpath):
    """Set both access and modification time of targetpath to
       tarinfo.mtime. Raise ExtractError if that fails.
    """
    if hasattr(os, 'utime'):
        stamp = tarinfo.mtime
        try:
            os.utime(targetpath, (stamp, stamp))
        except EnvironmentError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002407
2408 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, when
       TarFile is opened for reading. Return None if there is no more
       available.
    """
    self._check("ra")

    # A member stored in self.firstmember is handed out first.
    pending = self.firstmember
    if pending is not None:
        self.firstmember = None
        return pending

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as e:
            if self.ignore_zeros:
                # Skip this block and try the following one.
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as e:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
            elif self.offset == 0:
                raise ReadError(str(e))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as e:
            if self.offset == 0:
                raise ReadError(str(e))
        except SubsequentHeaderError as e:
            raise ReadError(str(e))
        # Either a header was read, or an error was tolerated and
        # tarinfo is still None.
        break

    if tarinfo is None:
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002454
2455 #--------------------------------------------------------------------------
2456 # Little helper methods:
2457
Lars Gustäbel1b512722010-06-03 12:45:16 +00002458 def _getmember(self, name, tarinfo=None, normalize=False):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002459 """Find an archive member by name from bottom to top.
2460 If tarinfo is given, it is used as the starting point.
2461 """
Martin v. Löwisf3c56112004-09-18 09:08:52 +00002462 # Ensure that all members have been loaded.
2463 members = self.getmembers()
2464
Lars Gustäbel1b512722010-06-03 12:45:16 +00002465 # Limit the member search list up to tarinfo.
2466 if tarinfo is not None:
2467 members = members[:members.index(tarinfo)]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002468
Lars Gustäbel1b512722010-06-03 12:45:16 +00002469 if normalize:
2470 name = os.path.normpath(name)
2471
2472 for member in reversed(members):
2473 if normalize:
2474 member_name = os.path.normpath(member.name)
2475 else:
2476 member_name = member.name
2477
2478 if name == member_name:
2479 return member
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002480
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002481 def _load(self):
2482 """Read through the entire archive file and look for readable
2483 members.
2484 """
2485 while True:
2486 tarinfo = self.next()
2487 if tarinfo is None:
2488 break
2489 self._loaded = True
2490
2491 def _check(self, mode=None):
2492 """Check if TarFile is still open, and if the operation's mode
2493 corresponds to TarFile's mode.
2494 """
2495 if self.closed:
Thomas Wouters477c8d52006-05-27 19:21:47 +00002496 raise IOError("%s is closed" % self.__class__.__name__)
Guido van Rossumd8faa362007-04-27 19:54:29 +00002497 if mode is not None and self.mode not in mode:
2498 raise IOError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002499
Lars Gustäbel1b512722010-06-03 12:45:16 +00002500 def _find_link_target(self, tarinfo):
2501 """Find the target member of a symlink or hardlink member in the
2502 archive.
2503 """
2504 if tarinfo.issym():
2505 # Always search the entire archive.
2506 linkname = os.path.dirname(tarinfo.name) + "/" + tarinfo.linkname
2507 limit = None
2508 else:
2509 # Search the archive before the link, because a hard link is
2510 # just a reference to an already archived file.
2511 linkname = tarinfo.linkname
2512 limit = tarinfo
2513
2514 member = self._getmember(linkname, tarinfo=limit, normalize=True)
2515 if member is None:
2516 raise KeyError("linkname %r not found" % linkname)
2517 return member
2518
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002519 def __iter__(self):
2520 """Provide an iterator object.
2521 """
2522 if self._loaded:
2523 return iter(self.members)
2524 else:
2525 return TarIter(self)
2526
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002527 def _dbg(self, level, msg):
2528 """Write debugging output to sys.stderr.
2529 """
2530 if level <= self.debug:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00002531 print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002532
2533 def __enter__(self):
2534 self._check()
2535 return self
2536
2537 def __exit__(self, type, value, traceback):
2538 if type is None:
2539 self.close()
2540 else:
2541 # An exception occurred. We must not call close() because
2542 # it would try to write end-of-archive blocks and padding.
2543 if not self._extfileobj:
2544 self.fileobj.close()
2545 self.closed = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002546# class TarFile
2547
class TarIter:
    """Iterator over the members of a TarFile.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Bind the iterator to `tarfile' and start at index 0.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """An iterator is its own iterator object.
        """
        return self

    def __next__(self):
        """Return the next member. While the TarFile is not fully loaded
           the member comes from its next() method; once next() is
           exhausted the TarFile is flagged as _loaded.
        """
        archive = self.tarfile
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely.
        if archive._loaded:
            # Everything is in memory already, continue by index.
            try:
                tarinfo = archive.members[self.index]
            except IndexError:
                raise StopIteration
        else:
            tarinfo = archive.next()
            if not tarinfo:
                archive._loaded = True
                raise StopIteration
        self.index += 1
        return tarinfo
2583
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002584#--------------------
2585# exported functions
2586#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.
    """
    try:
        archive = open(name)
    except TarError:
        return False
    else:
        archive.close()
        return True
2597
# Keep the builtin open() reachable as bltn_open, because the module-level
# name `open' is rebound to TarFile.open on the following line.
bltn_open = open
open = TarFile.open