blob: 1789828d31a805dc5a9aad33c67040f5bbb97633 [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
Guido van Rossumd8faa362007-04-27 19:54:29 +000032version = "0.9.0"
Guido van Rossum98297ee2007-11-06 21:34:58 +000033__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
Senthil Kumaran7c9719c2011-07-28 22:32:49 +080034__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
35__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
Guido van Rossum98297ee2007-11-06 21:34:58 +000036__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000037
38#---------
39# Imports
40#---------
41import sys
42import os
43import shutil
44import stat
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000045import time
46import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000047import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000048import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000049
50try:
51 import grp, pwd
52except ImportError:
53 grp = pwd = None
54
Brian Curtin16633fa2010-07-09 13:54:27 +000055# os.symlink on Windows prior to 6.0 raises NotImplementedError
# Exceptions that signal "symlinks cannot be created here".
symlink_exception = (AttributeError, NotImplementedError)
try:
    # WindowsError (1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege.  The name only exists on
    # Windows, hence the NameError guard.
    symlink_exception = symlink_exception + (WindowsError,)
except NameError:
    pass
63
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000064# from tarfile import *
65__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
66
Georg Brandl1a3284e2007-12-02 09:40:06 +000067from builtins import open as _open # Since 'open' is TarFile.open
Guido van Rossum8f78fe92006-08-24 04:03:53 +000068
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000069#---------------------------------------------------------
70# tar constants
71#---------------------------------------------------------
# Basic sizes.
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records

# Magic strings.  Both are 8 bytes long: GNU tar writes "ustar" followed by
# two spaces and a NUL, POSIX writes "ustar", a NUL and the version "00".
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

# Field length limits.
LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

# Member type flags.
REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

# GNU tar extensions.
GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

# pax extensions.
XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

# Archive formats.
USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000104
105#---------------------------------------------------------
106# tarfile constants
107#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (
    REGTYPE, AREGTYPE, LNKTYPE,
    SYMTYPE, DIRTYPE, FIFOTYPE,
    CONTTYPE, CHRTYPE, BLKTYPE,
    GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
    GNUTYPE_SPARSE,
)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE, CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = (
    "path", "linkpath", "size", "mtime",
    "uid", "gid", "uname", "gname",
)

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Each value is the converter for that field.
PAX_NUMBER_FIELDS = {
    "atime": float, "ctime": float, "mtime": float,
    "uid": int, "gid": int, "size": int,
}
140
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000141#---------------------------------------------------------
142# Bits used in the mode field, values in octal.
143#---------------------------------------------------------
# File type bits.
S_IFLNK = 0o120000        # symbolic link
S_IFREG = 0o100000        # regular file
S_IFBLK = 0o060000        # block device
S_IFDIR = 0o040000        # directory
S_IFCHR = 0o020000        # character device
S_IFIFO = 0o010000        # fifo

# Special permission bits.
TSUID = 0o4000            # set UID on execution
TSGID = 0o2000            # set GID on execution
TSVTX = 0o1000            # reserved

# Owner permission bits.
TUREAD = 0o400            # read by owner
TUWRITE = 0o200           # write by owner
TUEXEC = 0o100            # execute/search by owner

# Group permission bits.
TGREAD = 0o040            # read by group
TGWRITE = 0o020           # write by group
TGEXEC = 0o010            # execute/search by group

# Other permission bits.
TOREAD = 0o004            # read by other
TOWRITE = 0o002           # write by other
TOEXEC = 0o001            # execute/search by other
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000164
165#---------------------------------------------------------
Guido van Rossumd8faa362007-04-27 19:54:29 +0000166# initialization
167#---------------------------------------------------------
# Default text encoding: fixed UTF-8 on "nt"/"ce", otherwise whatever the
# interpreter reports as the filesystem encoding.
ENCODING = "utf-8" if os.name in ("nt", "ce") else sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000172
173#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000174# Some useful functions
175#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000176
def stn(s, length, encoding, errors):
    """Encode the string s and return exactly length bytes: the encoded
       text truncated to length, or padded on the right with NUL bytes.
    """
    encoded = s.encode(encoding, errors)
    # A negative count yields an empty padding, so over-long input is
    # simply truncated by the slice.
    padding = NUL * (length - len(encoded))
    return encoded[:length] + padding
Thomas Wouters477c8d52006-05-27 19:21:47 +0000182
def nts(s, encoding, errors):
    """Decode a bytes field to a string, ignoring everything from the
       first null byte onwards.
    """
    end = s.find(b"\0")
    raw = s if end == -1 else s[:end]
    return raw.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000190
def nti(s):
    """Convert a number field to a python number.

       Raises InvalidHeaderError if the field is not valid octal text.
    """
    # There are two possible encodings for a number field, see
    # itn() below.
    marker = s[0]
    if marker not in (0o200, 0o377):
        # Plain encoding: octal digits, possibly null-terminated.  An
        # empty field counts as zero.
        try:
            return int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")

    # Base-256 encoding: the bytes after the marker are big-endian.
    value = 0
    for byte in s[1:]:
        value = (value << 8) + byte
    if marker == 0o377:
        # A 0o377 marker flags a negative number.
        value = -(256 ** (len(s) - 1) - value)
    return value
209
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field of digits bytes.

       Raises ValueError if n does not fit into the field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    width = digits - 1
    if 0 <= n < 8 ** width:
        return bytes("%0*o" % (width, n), "ascii") + NUL

    if format != GNU_FORMAT or not (-256 ** width <= n < 256 ** width):
        raise ValueError("overflow in number field")

    if n >= 0:
        field = bytearray([0o200])
    else:
        field = bytearray([0o377])
        n = 256 ** digits + n

    # Inserting each low byte right after the marker leaves the most
    # significant byte first once the loop is done.
    for _ in range(width):
        field.insert(1, n & 0o377)
        n >>= 8
    return field
237
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a member's header.

       All bytes of the 512 byte header are summed up except for the
       chksum field (bytes 148-155), which is treated as if it was
       filled with spaces. According to the GNU tar sources, some tars
       (Sun and NeXT) calculate chksum with signed char, which will be
       different if there are chars in the buffer with the high bit
       set, so both variants are calculated.
    """
    head, tail = buf[:148], buf[156:512]
    unsigned_total = sum(struct.unpack("148B", head)) + sum(struct.unpack("356B", tail))
    signed_total = sum(struct.unpack("148b", head)) + sum(struct.unpack("356b", tail))
    # 256 is the sum of the eight spaces standing in for the chksum field.
    return 256 + unsigned_total, 256 + signed_total
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000250
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.
       Raises IOError if src ends before length bytes were read.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    BUFSIZE = 16 * 1024
    full_chunks, tail = divmod(length, BUFSIZE)
    # Full-sized chunks first, then the shorter remainder (if any).
    wanted_sizes = [BUFSIZE] * full_chunks
    if tail != 0:
        wanted_sizes.append(tail)

    for wanted in wanted_sizes:
        data = src.read(wanted)
        if len(data) < wanted:
            raise IOError("end of file reached")
        dst.write(data)
    return
275
# One inner tuple per character of the mode string built by filemode().
# Within an inner tuple the first matching (bits, char) pair wins.
filemode_table = (
    # file type
    ((S_IFLNK, "l"), (S_IFREG, "-"), (S_IFBLK, "b"),
     (S_IFDIR, "d"), (S_IFCHR, "c"), (S_IFIFO, "p")),

    # owner
    ((TUREAD, "r"),),
    ((TUWRITE, "w"),),
    ((TUEXEC | TSUID, "s"), (TSUID, "S"), (TUEXEC, "x")),

    # group
    ((TGREAD, "r"),),
    ((TGWRITE, "w"),),
    ((TGEXEC | TSGID, "s"), (TSGID, "S"), (TGEXEC, "x")),

    # other
    ((TOREAD, "r"),),
    ((TOWRITE, "w"),),
    ((TOEXEC | TSVTX, "t"), (TSVTX, "T"), (TOEXEC, "x")),
)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000302
def filemode(mode):
    """Convert a file's mode to a string of the form
       -rwxrwxrwx.
       Used by TarFile.list()
    """
    # For every position take the character of the first entry whose bits
    # are all set in mode, or "-" when none of them match.
    return "".join(
        next((char for bit, char in table if mode & bit == bit), "-")
        for table in filemode_table
    )
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000317
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000351
352#---------------------------
353# internal stream interface
354#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.
    """

    def __init__(self, name, mode):
        """Open name with os.open(); mode must be "r" or "w"."""
        flags = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        # O_BINARY is not defined on every platform; OR-ing in 0 is a no-op.
        flags |= getattr(os, "O_BINARY", 0)
        self.fd = os.open(name, flags, 0o666)

    def close(self):
        """Close the underlying file descriptor."""
        os.close(self.fd)

    def read(self, size):
        """Return what a single os.read() of up to size bytes yields."""
        return os.read(self.fd, size)

    def write(self, s):
        """Pass s to a single os.write() call."""
        os.write(self.fd, s)
378
class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip or bzip2 compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.

           If fileobj is None, name is opened with _LowLevelFile and the
           stream owns (and later closes) that file.  A comptype of '*'
           enables detection of the compression from the first block.
           Raises CompressionError if the required compression module
           cannot be imported.
        """
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""
        self.pos = 0
        self.closed = False

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                if mode == "r":
                    self._init_read_gz()
                else:
                    self._init_write_gz()

            if comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                else:
                    self.cmp = bz2.BZ2Compressor()
        except:
            # Do not leak a file we opened ourselves; the error is
            # re-raised unchanged.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # "closed" is missing if __init__ failed before setting it.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)
        timestamp = struct.pack("<L", int(time.time()))
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        # pos counts uncompressed bytes.
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Write string s to the stream if a whole new block
           is ready to be written.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        if self.mode == "w" and self.comptype != "tar":
            self.buf += self.cmp.flush()

        if self.mode == "w" and self.buf:
            self.fileobj.write(self.buf)
            self.buf = b""
            if self.comptype == "gz":
                # The native zlib crc is an unsigned 32-bit integer, but
                # the Python wrapper implicitly casts that to a signed C
                # long.  So, on a 32-bit box self.crc may "look negative",
                # while the same crc on a 64-bit box may "look positive".
                # To avoid irksome warnings from the `struct` module, force
                # it to look positive on all boxes.
                self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
                self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))

        if not self._extfileobj:
            self.fileobj.close()

        self.closed = True

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

           Raises ReadError if the gzip magic is missing and
           CompressionError if the compression method is not deflate.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)

        if flag & 4:
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            # The extra field is raw header data: skip it with the raw
            # reader.  The public read() would push it through the
            # decompressor and advance self.pos.
            self.__read(xlen)
        if flag & 8:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.  Seeking forward is done by reading and
           discarding data.  Raises StreamError on a backward seek.
        """
        if pos - self.pos >= 0:
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            # The chunks are bytes, so the separator must be bytes too.
            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream.

           For compressed streams, raw blocks are decompressed until
           enough data is available or the input is exhausted.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break
            try:
                buf = self.cmp.decompress(buf)
            except IOError:
                raise ReadError("invalid compressed data")
            self.dbuf += buf
            c += len(buf)
        buf = self.dbuf[:size]
        self.dbuf = self.dbuf[size:]
        return buf

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            self.buf += buf
            c += len(buf)
        buf = self.buf[:size]
        self.buf = self.buf[size:]
        return buf
# class _Stream
610
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        """Wrap fileobj and pre-read its first block for inspection."""
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        """Return the pre-read block, whatever size is.

           From the second call on, reads go straight to the wrapped
           file object.
        """
        first_block = self.buf
        # Shadow this method on the instance with the real read().
        self.read = self.fileobj.read
        return first_block

    def getcomptype(self):
        """Return "gz", "bz2" or "tar" judging by the pre-read block."""
        head = self.buf
        if head.startswith(b"\037\213\010"):
            return "gz"
        is_bz2 = head[0:3] == b"BZh" and head[4:10] == b"1AY&SY"
        return "bz2" if is_bz2 else "tar"

    def close(self):
        """Close the wrapped file object."""
        self.fileobj.close()
# class StreamProxy
634
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000635#------------------------
636# Extraction file object
637#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        """Expose size bytes whose stored data starts at offset in fileobj.

           blockinfo is a sequence of (offset, size) pairs naming the
           ranges of the member that are actually stored; by default
           the whole member is stored as one block.
        """
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.  Each entry is
        # (is_data, start, stop, position_in_fileobj).
        self.map_index = 0
        self.map = []
        covered = 0
        stored_at = self.offset
        for block_start, block_len in blockinfo:
            if block_start > covered:
                # Hole in front of this data block.
                self.map.append((False, covered, block_start, None))
            block_stop = block_start + block_len
            self.map.append((True, block_start, block_stop, stored_at))
            stored_at += block_len
            covered = block_stop
        if covered < self.size:
            # Trailing hole.
            self.map.append((False, covered, self.size, None))

    def seekable(self):
        """Return the wrapped file's seekable(), or True if it has none."""
        if hasattr(self.fileobj, "seekable"):
            return self.fileobj.seekable()
        # XXX gzip.GzipFile and bz2.BZ2File
        return True

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position):
        """Seek to a position in the file.
        """
        self.position = position

    def read(self, size=None):
        """Read data from the file.
        """
        available = self.size - self.position
        size = available if size is None else min(size, available)

        buf = b""
        while size > 0:
            # Find the map entry containing the current position,
            # starting at the last used entry and wrapping around.
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                self.map_index = (self.map_index + 1) % len(self.map)

            length = min(size, stop - self.position)
            if not data:
                # Holes read as null bytes.
                buf += NUL * length
            else:
                self.fileobj.seek(offset + (self.position - start))
                buf += self.fileobj.read(length)
            size -= length
            self.position += length
        return buf
#class _FileInFile
711
712
713class ExFileObject(object):
714 """File-like object for reading an archive member.
715 Is returned by TarFile.extractfile().
716 """
717 blocksize = 1024
718
719 def __init__(self, tarfile, tarinfo):
720 self.fileobj = _FileInFile(tarfile.fileobj,
721 tarinfo.offset_data,
722 tarinfo.size,
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000723 tarinfo.sparse)
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000724 self.name = tarinfo.name
725 self.mode = "r"
726 self.closed = False
727 self.size = tarinfo.size
728
729 self.position = 0
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000730 self.buffer = b""
731
732 def readable(self):
733 return True
734
735 def writable(self):
736 return False
737
738 def seekable(self):
739 return self.fileobj.seekable()
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000740
741 def read(self, size=None):
742 """Read at most size bytes from the file. If size is not
743 present or None, read all data until EOF is reached.
744 """
745 if self.closed:
746 raise ValueError("I/O operation on closed file")
747
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000748 buf = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000749 if self.buffer:
750 if size is None:
751 buf = self.buffer
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000752 self.buffer = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000753 else:
754 buf = self.buffer[:size]
755 self.buffer = self.buffer[size:]
756
757 if size is None:
758 buf += self.fileobj.read()
759 else:
760 buf += self.fileobj.read(size - len(buf))
761
762 self.position += len(buf)
763 return buf
764
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000765 # XXX TextIOWrapper uses the read1() method.
766 read1 = read
767
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000768 def readline(self, size=-1):
769 """Read one entire line from the file. If size is present
770 and non-negative, return a string with at most that
771 size, which may be an incomplete line.
772 """
773 if self.closed:
774 raise ValueError("I/O operation on closed file")
775
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000776 pos = self.buffer.find(b"\n") + 1
777 if pos == 0:
778 # no newline found.
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000779 while True:
780 buf = self.fileobj.read(self.blocksize)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000781 self.buffer += buf
782 if not buf or b"\n" in buf:
783 pos = self.buffer.find(b"\n") + 1
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000784 if pos == 0:
785 # no newline found.
786 pos = len(self.buffer)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000787 break
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000788
789 if size != -1:
790 pos = min(size, pos)
791
792 buf = self.buffer[:pos]
793 self.buffer = self.buffer[pos:]
794 self.position += len(buf)
795 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000796
797 def readlines(self):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000798 """Return a list with all remaining lines.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000799 """
800 result = []
801 while True:
802 line = self.readline()
803 if not line: break
804 result.append(line)
805 return result
806
def tell(self):
    """Report the current position within the file.

    Raises ValueError if the file object has been closed.
    """
    if not self.closed:
        return self.position
    raise ValueError("I/O operation on closed file")
814
def seek(self, pos, whence=os.SEEK_SET):
    """Seek to a position in the file and return the new position.

    pos is interpreted relative to the start (os.SEEK_SET), the current
    position (os.SEEK_CUR) or the end (os.SEEK_END) of the file. The
    resulting position is clamped to the range 0..self.size.

    Raises ValueError if the file object is closed or whence is not one
    of the three constants above.
    """
    if self.closed:
        raise ValueError("I/O operation on closed file")

    if whence == os.SEEK_SET:
        target = pos
    elif whence == os.SEEK_CUR:
        target = self.position + pos
    elif whence == os.SEEK_END:
        target = self.size + pos
    else:
        raise ValueError("Invalid argument")

    self.position = min(max(target, 0), self.size)

    # Buffered read-ahead belongs to the old position; drop it.
    self.buffer = b""
    self.fileobj.seek(self.position)
    # File objects are expected to report the new absolute position.
    return self.position
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000835
def close(self):
    """Mark the file object as closed.

    Only the closed flag is set here; methods that check it raise
    ValueError afterwards.
    """
    self.closed = True
Martin v. Löwisdf241532005-03-03 08:17:42 +0000840
def __iter__(self):
    """Iterate over the file's remaining lines.

    Each line comes from readline(); iteration stops at the first empty
    result.
    """
    line = self.readline()
    while line:
        yield line
        line = self.readline()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000849#class ExFileObject
850
851#------------------
852# Exported Classes
853#------------------
854class TarInfo(object):
855 """Informational class which holds the details about an
856 archive member given by a tar header block.
857 TarInfo objects are returned by TarFile.getmember(),
858 TarFile.getmembers() and TarFile.gettarinfo() and are
859 usually created internally.
860 """
861
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000862 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
863 "chksum", "type", "linkname", "uname", "gname",
864 "devmajor", "devminor",
865 "offset", "offset_data", "pax_headers", "sparse",
866 "tarfile", "_sparse_structs", "_link_target")
867
def __init__(self, name=""):
    """Create a TarInfo with default field values.

    name is the optional member name; every other field starts out with
    a neutral default.
    """
    # Fields that are stored in the header block.
    self.name = name            # member name
    self.mode = 0o644           # file permissions
    self.uid = self.gid = 0     # user id / group id
    self.size = 0               # file size
    self.mtime = 0              # modification time
    self.chksum = 0             # header checksum
    self.type = REGTYPE         # member type
    self.linkname = ""          # link name
    self.uname = self.gname = ""            # user name / group name
    self.devmajor = self.devminor = 0       # device major / minor number

    # Positions inside the archive.
    self.offset = 0             # the tar header starts here
    self.offset_data = 0        # the file's data starts here

    # Extended information.
    self.sparse = None          # sparse member information
    self.pax_headers = {}       # pax header information
891
# In pax headers the "name" and "linkname" field are called
# "path" and "linkpath". The properties below expose the same
# attributes under the pax spelling.
def _getpath(self):
    # Getter for the `path` property: reads self.name.
    return self.name
def _setpath(self, name):
    # Setter for the `path` property: writes self.name.
    self.name = name
path = property(_getpath, _setpath)
899
def _getlinkpath(self):
    # Getter for the `linkpath` property: reads self.linkname.
    return self.linkname
def _setlinkpath(self, linkname):
    # Setter for the `linkpath` property: writes self.linkname.
    self.linkname = linkname
# `linkpath` is the pax-style spelling of the `linkname` attribute.
linkpath = property(_getlinkpath, _setlinkpath)
905
def __repr__(self):
    """Return a debug representation: class name, member name and id."""
    fields = (self.__class__.__name__, self.name, id(self))
    return "<%s %r at %#x>" % fields
908
def get_info(self):
    """Return the TarInfo's header attributes as a dictionary.

    The mode is masked with 0o7777, and the name of a directory member
    (type DIRTYPE) always ends with a slash in the returned dictionary.
    The object itself is not modified.
    """
    name = self.name
    if self.type == DIRTYPE and not name.endswith("/"):
        name += "/"

    return {
        "name":     name,
        "mode":     self.mode & 0o7777,
        "uid":      self.uid,
        "gid":      self.gid,
        "size":     self.size,
        "mtime":    self.mtime,
        "chksum":   self.chksum,
        "type":     self.type,
        "linkname": self.linkname,
        "uname":    self.uname,
        "gname":    self.gname,
        "devmajor": self.devmajor,
        "devminor": self.devminor,
    }
932
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
    """Return the tar header for this member in the requested format.

    format selects the header flavour (USTAR_FORMAT, GNU_FORMAT or
    PAX_FORMAT); encoding and errors control how text fields are
    encoded. The pax variant is not passed the errors argument.

    Raises ValueError for any other format value.
    """
    info = self.get_info()

    if format == USTAR_FORMAT:
        return self.create_ustar_header(info, encoding, errors)
    if format == GNU_FORMAT:
        return self.create_gnu_header(info, encoding, errors)
    if format == PAX_FORMAT:
        return self.create_pax_header(info, encoding)
    raise ValueError("invalid format")
946
def create_ustar_header(self, info, encoding, errors):
    """Return the member described by info as a ustar header block.

    info is updated in place: the magic is set and an over-long name is
    split into "prefix" and "name".

    Raises ValueError if the link name exceeds LENGTH_LINK or the name
    cannot be split (see _posix_split_name()).
    """
    info["magic"] = POSIX_MAGIC

    if len(info["linkname"]) > LENGTH_LINK:
        raise ValueError("linkname is too long")

    full_name = info["name"]
    if len(full_name) > LENGTH_NAME:
        prefix, short_name = self._posix_split_name(full_name)
        info["prefix"] = prefix
        info["name"] = short_name

    return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000959
def create_gnu_header(self, info, encoding, errors):
    """Return the member described by info as a GNU header block sequence.

    A long-link and/or long-name pseudo member is emitted in front of
    the regular header when the respective field exceeds its limit.
    """
    info["magic"] = GNU_MAGIC

    chunks = []
    linkname = info["linkname"]
    if len(linkname) > LENGTH_LINK:
        chunks.append(self._create_gnu_long_header(
            linkname, GNUTYPE_LONGLINK, encoding, errors))

    name = info["name"]
    if len(name) > LENGTH_NAME:
        chunks.append(self._create_gnu_long_header(
            name, GNUTYPE_LONGNAME, encoding, errors))

    chunks.append(self._create_header(info, GNU_FORMAT, encoding, errors))
    return b"".join(chunks)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000973
def create_pax_header(self, info, encoding):
    """Return the member described by info as a ustar header block,
    preceded by a pax extended header if some value cannot be stored
    in the plain ustar fields.

    info is modified in place: the magic is set and numeric fields that
    move to the pax header are zeroed.
    """
    info["magic"] = POSIX_MAGIC
    pax_headers = self.pax_headers.copy()

    # String fields go to the pax header when they are not pure ASCII or
    # exceed the field length.
    string_fields = (
        ("name", "path", LENGTH_NAME),
        ("linkname", "linkpath", LENGTH_LINK),
        ("uname", "uname", 32),
        ("gname", "gname", 32),
    )
    for name, hname, length in string_fields:
        if hname in pax_headers:
            # The pax header has priority.
            continue

        text = info[name]
        try:
            text.encode("ascii", "strict")
        except UnicodeEncodeError:
            fits = False
        else:
            fits = len(text) <= length

        if not fits:
            pax_headers[hname] = text

    # Number fields go to the pax header when they exceed the field
    # limit or are floats.
    number_fields = (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12))
    for name, digits in number_fields:
        if name not in pax_headers:
            val = info[name]
            representable = 0 <= val < 8 ** (digits - 1)
            if representable and not isinstance(val, float):
                continue
            pax_headers[name] = str(val)
        # Either the pax header already had priority or the value was
        # just moved there; zero the ustar field to avoid overflow.
        info[name] = 0

    # Create a pax extended header if necessary.
    extended = (self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
                if pax_headers else b"")

    return extended + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001022
@classmethod
def create_pax_global_header(cls, pax_headers):
    """Build a pax global header block sequence from pax_headers.

    The records are written with type XGLTYPE and UTF-8 encoding.
    """
    global_header = cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
    return global_header
Guido van Rossumd8faa362007-04-27 19:54:29 +00001028
def _posix_split_name(self, name):
    """Split an over-long name into a (prefix, name) pair.

    The split happens at the last slash found within the first
    LENGTH_PREFIX + 1 characters; the slash itself is dropped.

    Raises ValueError if no usable prefix exists or the remaining name
    part is still longer than LENGTH_NAME.
    """
    head = name[:LENGTH_PREFIX + 1]
    # One past the last slash inside the head; 0 when there is none.
    cut = head.rfind("/") + 1

    rest = name[cut:]
    prefix = head[:cut][:-1]

    if not prefix or len(rest) > LENGTH_NAME:
        raise ValueError("name is too long")
    return prefix, rest
1043
@staticmethod
def _create_header(info, format, encoding, errors):
    """Return a header block. info is a dictionary with file
    information, format must be one of the *_FORMAT constants.

    Missing keys in info fall back to neutral defaults. Text fields are
    encoded with encoding/errors. The block is padded to BLOCKSIZE bytes
    and the checksum is patched in after it has been computed over the
    block with a blank checksum field.
    """
    parts = [
        stn(info.get("name", ""), 100, encoding, errors),
        itn(info.get("mode", 0) & 0o7777, 8, format),
        itn(info.get("uid", 0), 8, format),
        itn(info.get("gid", 0), 8, format),
        itn(info.get("size", 0), 12, format),
        itn(info.get("mtime", 0), 12, format),
        # Checksum placeholder. It must span the full 8-byte field so
        # that the following fields keep the offsets frombuf() reads
        # (checksum at 148..155, type at 156); spelled as a repetition
        # so the width cannot be lost to whitespace mangling.
        b" " * 8,
        info.get("type", REGTYPE),
        stn(info.get("linkname", ""), 100, encoding, errors),
        info.get("magic", POSIX_MAGIC),
        stn(info.get("uname", ""), 32, encoding, errors),
        stn(info.get("gname", ""), 32, encoding, errors),
        itn(info.get("devmajor", 0), 8, format),
        itn(info.get("devminor", 0), 8, format),
        stn(info.get("prefix", ""), 155, encoding, errors)
    ]

    # Pad the joined fields with NULs up to a full block.
    buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
    chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
    # Overwrite the first seven bytes of the checksum field with six
    # octal digits and a NUL; the eighth byte stays a blank.
    buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
    return buf
1071
@staticmethod
def _create_payload(payload):
    """Pad payload with NUL bytes up to the next multiple of BLOCKSIZE.

    A payload whose length already is a multiple of BLOCKSIZE is
    returned unchanged.
    """
    overhang = len(payload) % BLOCKSIZE
    if overhang > 0:
        payload += (BLOCKSIZE - overhang) * NUL
    return payload
1081
@classmethod
def _create_gnu_long_header(cls, name, type, encoding, errors):
    """Build a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence for name.

    The result is a "././@LongLink" header of the given type followed by
    the encoded, NUL-terminated name padded to whole blocks.
    """
    encoded = name.encode(encoding, errors) + NUL

    info = {
        "name": "././@LongLink",
        "type": type,
        "size": len(encoded),
        "magic": GNU_MAGIC,
    }

    # Extended header first, then the name blocks.
    header = cls._create_header(info, USTAR_FORMAT, encoding, errors)
    return header + cls._create_payload(encoded)
1098
@classmethod
def _create_pax_generic_header(cls, pax_headers, type, encoding):
    """Build a POSIX.1-2008 extended or global header sequence from
    the keyword/value pairs in pax_headers. The values must be strings.

    type is the header type written into the block; encoding is only
    used for values when the header has to be stored as BINARY.
    """
    # If any value contains surrogate characters it cannot be encoded
    # as strict UTF-8, which forces hdrcharset=BINARY; see _proc_pax()
    # for more information.
    binary = False
    for value in pax_headers.values():
        try:
            value.encode("utf-8", "strict")
        except UnicodeEncodeError:
            binary = True
            break

    chunks = []
    if binary:
        # The hdrcharset field goes at the beginning of the header.
        chunks.append(b"21 hdrcharset=BINARY\n")

    for keyword, value in pax_headers.items():
        key_bytes = keyword.encode("utf-8")
        if binary:
            # Try to restore the original byte representation of the
            # value; this only works if encoding matches the string.
            val_bytes = value.encode(encoding, "surrogateescape")
        else:
            val_bytes = value.encode("utf-8")

        # The record length counts its own decimal digits, so iterate
        # until the total stops changing.
        base = len(key_bytes) + len(val_bytes) + 3   # ' ' + '=' + '\n'
        total = 0
        while True:
            candidate = base + len(str(total))
            if candidate == total:
                break
            total = candidate
        chunks.append(bytes(str(total), "ascii") + b" " + key_bytes + b"=" + val_bytes + b"\n")

    records = b"".join(chunks)

    # We use a hardcoded "././@PaxHeader" name like star does
    # instead of the one that POSIX recommends.
    info = {
        "name": "././@PaxHeader",
        "type": type,
        "size": len(records),
        "magic": POSIX_MAGIC,
    }

    # Pax header block first, then the record blocks.
    header = cls._create_header(info, USTAR_FORMAT, "ascii", "replace")
    return header + cls._create_payload(records)
1149
@classmethod
def frombuf(cls, buf, encoding, errors):
    """Build a TarInfo object from a header block given as bytes.

    encoding and errors are used to decode the text fields.

    Raises EmptyHeaderError for an empty buffer, TruncatedHeaderError if
    the buffer is not exactly BLOCKSIZE long, EOFHeaderError if it
    consists only of NUL bytes, and InvalidHeaderError if the stored
    checksum does not match.
    """
    size = len(buf)
    if size == 0:
        raise EmptyHeaderError("empty header")
    if size != BLOCKSIZE:
        raise TruncatedHeaderError("truncated header")
    if buf.count(NUL) == BLOCKSIZE:
        raise EOFHeaderError("end of file header")

    chksum = nti(buf[148:156])
    if chksum not in calc_chksums(buf):
        raise InvalidHeaderError("bad checksum")

    member = cls()
    member.name = nts(buf[0:100], encoding, errors)
    member.mode = nti(buf[100:108])
    member.uid = nti(buf[108:116])
    member.gid = nti(buf[116:124])
    member.size = nti(buf[124:136])
    member.mtime = nti(buf[136:148])
    member.chksum = chksum
    member.type = buf[156:157]
    member.linkname = nts(buf[157:257], encoding, errors)
    member.uname = nts(buf[265:297], encoding, errors)
    member.gname = nts(buf[297:329], encoding, errors)
    member.devmajor = nti(buf[329:337])
    member.devminor = nti(buf[337:345])
    prefix = nts(buf[345:500], encoding, errors)

    # The old V7 tar format represents a directory as a regular
    # file with a trailing slash.
    if member.type == AREGTYPE and member.name.endswith("/"):
        member.type = DIRTYPE

    # The old GNU sparse format uses otherwise unused space in the
    # header for up to 4 sparse structures. Save them for later
    # processing in _proc_sparse().
    if member.type == GNUTYPE_SPARSE:
        entries = []
        for start in range(386, 386 + 4 * 24, 24):
            try:
                offset = nti(buf[start:start + 12])
                numbytes = nti(buf[start + 12:start + 24])
            except ValueError:
                break
            entries.append((offset, numbytes))
        isextended = bool(buf[482])
        origsize = nti(buf[483:495])
        member._sparse_structs = (entries, isextended, origsize)

    # Remove redundant slashes from directories.
    if member.isdir():
        member.name = member.name.rstrip("/")

    # Reconstruct a ustar longname.
    if prefix and member.type not in GNU_TYPES:
        member.name = prefix + "/" + member.name
    return member
1212
@classmethod
def fromtarfile(cls, tarfile):
    """Read the next header from the TarFile object tarfile and return
    the TarInfo object produced for it.

    The header block is parsed with frombuf(), its start offset is
    recorded, and type-specific processing is left to _proc_member().
    """
    fileobj = tarfile.fileobj
    header = fileobj.read(BLOCKSIZE)
    member = cls.frombuf(header, tarfile.encoding, tarfile.errors)
    member.offset = fileobj.tell() - BLOCKSIZE
    return member._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001222
Guido van Rossumd8faa362007-04-27 19:54:29 +00001223 #--------------------------------------------------------------------------
1224 # The following are methods that are called depending on the type of a
1225 # member. The entry point is _proc_member() which can be overridden in a
1226 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1227 # implement the following
1228 # operations:
1229 # 1. Set self.offset_data to the position where the data blocks begin,
1230 # if there is data that follows.
1231 # 2. Set tarfile.offset to the position where the next member's header will
1232 # begin.
1233 # 3. Return self or another valid TarInfo object.
def _proc_member(self, tarfile):
    """Dispatch to the processing method that matches the member's
    type and return its result.
    """
    member_type = self.type
    if member_type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
        return self._proc_gnulong(tarfile)
    if member_type == GNUTYPE_SPARSE:
        return self._proc_sparse(tarfile)
    if member_type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
        return self._proc_pax(tarfile)
    return self._proc_builtin(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001246
def _proc_builtin(self, tarfile):
    """Process a builtin type, or an unknown type which is treated
    like a regular file.

    Records where the data starts, sets tarfile.offset to the start of
    the next header, applies saved global pax information and returns
    self.
    """
    data_start = tarfile.fileobj.tell()
    self.offset_data = data_start

    if self.isreg() or self.type not in SUPPORTED_TYPES:
        # Data blocks follow; the next header comes after them.
        tarfile.offset = data_start + self._block(self.size)
    else:
        tarfile.offset = data_start

    # Patch the TarInfo object with saved global header information.
    self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

    return self
1263
def _proc_gnulong(self, tarfile):
    """Process the blocks that hold a GNU longname or longlink member.

    The long value is read from the data blocks and patched into the
    TarInfo object built from the header that follows, which is
    returned.

    Raises SubsequentHeaderError if that following header is missing
    or bad.
    """
    raw = tarfile.fileobj.read(self._block(self.size))

    # Fetch the next header and process it.
    try:
        member = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # The combined member starts where this pseudo member started.
    member.offset = self.offset
    if self.type == GNUTYPE_LONGNAME:
        member.name = nts(raw, tarfile.encoding, tarfile.errors)
    elif self.type == GNUTYPE_LONGLINK:
        member.linkname = nts(raw, tarfile.encoding, tarfile.errors)

    return member
1285
def _proc_sparse(self, tarfile):
    """Process a GNU sparse header plus any extended sparse headers.

    Sets self.sparse, self.offset_data, tarfile.offset and replaces
    self.size by the original file size; returns self.
    """
    # Some sparse structures were already collected in frombuf().
    structs, isextended, origsize = self._sparse_structs
    del self._sparse_structs

    # Collect sparse structures from extended header blocks, up to 21
    # entries of 24 bytes per block.
    while isextended:
        block = tarfile.fileobj.read(BLOCKSIZE)
        for start in range(0, 21 * 24, 24):
            try:
                offset = nti(block[start:start + 12])
                numbytes = nti(block[start + 12:start + 24])
            except ValueError:
                break
            if offset and numbytes:
                structs.append((offset, numbytes))
        isextended = bool(block[504])
    self.sparse = structs

    self.offset_data = tarfile.fileobj.tell()
    # self.size still holds the stored size here; it is replaced by the
    # original size only after the next header offset is computed.
    tarfile.offset = self.offset_data + self._block(self.size)
    self.size = origsize
    return self
1313
def _proc_pax(self, tarfile):
    """Process an extended or global header as described in
    POSIX.1-2008.

    The pax records are parsed into a dictionary, the header that
    follows is read, and (for extended headers) the records are applied
    to the resulting TarInfo object, which is returned.

    Raises InvalidHeaderError if a record declares a length of zero and
    SubsequentHeaderError if the following header is missing or bad.
    """
    # Read the header information.
    buf = tarfile.fileobj.read(self._block(self.size))

    # A pax header stores supplemental information for either
    # the following file (extended) or all following files
    # (global).
    if self.type == XGLTYPE:
        pax_headers = tarfile.pax_headers
    else:
        pax_headers = tarfile.pax_headers.copy()

    # Check if the pax header contains a hdrcharset field. This tells us
    # the encoding of the path, linkpath, uname and gname fields. Normally,
    # these fields are UTF-8 encoded but since POSIX.1-2008 tar
    # implementations are allowed to store them as raw binary strings if
    # the translation to UTF-8 fails.
    match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
    if match is not None:
        pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

    # For the time being, we don't care about anything other than "BINARY".
    # The only other value that is currently allowed by the standard is
    # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
    hdrcharset = pax_headers.get("hdrcharset")
    if hdrcharset == "BINARY":
        encoding = tarfile.encoding
    else:
        encoding = "utf-8"

    # Parse pax header information. A record looks like that:
    # "%d %s=%s\n" % (length, keyword, value). length is the size
    # of the complete record including the length field itself and
    # the newline. keyword and value are both UTF-8 encoded strings.
    regex = re.compile(br"(\d+) ([^=]+)=")
    pos = 0
    while True:
        match = regex.match(buf, pos)
        if not match:
            break

        length, keyword = match.groups()
        length = int(length)
        if length == 0:
            # A zero length would leave pos unchanged below and make
            # this loop spin forever on a crafted archive.
            raise InvalidHeaderError("invalid header")
        value = buf[match.end(2) + 1:match.start(1) + length - 1]

        # Normally, we could just use "utf-8" as the encoding and "strict"
        # as the error handler, but we better not take the risk. For
        # example, GNU tar <= 1.23 is known to store filenames it cannot
        # translate to UTF-8 as raw strings (unfortunately without a
        # hdrcharset=BINARY header).
        # We first try the strict standard encoding, and if that fails we
        # fall back on the user's encoding and error handler.
        keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                tarfile.errors)
        if keyword in PAX_NAME_FIELDS:
            value = self._decode_pax_field(value, encoding, tarfile.encoding,
                    tarfile.errors)
        else:
            value = self._decode_pax_field(value, "utf-8", "utf-8",
                    tarfile.errors)

        pax_headers[keyword] = value
        pos += length

    # Fetch the next header.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Process GNU sparse information.
    if "GNU.sparse.map" in pax_headers:
        # GNU extended sparse format version 0.1.
        self._proc_gnusparse_01(next, pax_headers)

    elif "GNU.sparse.size" in pax_headers:
        # GNU extended sparse format version 0.0.
        self._proc_gnusparse_00(next, pax_headers, buf)

    elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
        # GNU extended sparse format version 1.0.
        self._proc_gnusparse_10(next, pax_headers, tarfile)

    if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
        # Patch the TarInfo object with the extended header info.
        next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
        next.offset = self.offset

        if "size" in pax_headers:
            # If the extended header replaces the size field,
            # we need to recalculate the offset where the next
            # header starts.
            offset = next.offset_data
            if next.isreg() or next.type not in SUPPORTED_TYPES:
                offset += next._block(next.size)
            tarfile.offset = offset

    return next
1415
def _proc_gnusparse_00(self, next, pax_headers, buf):
    """Process a GNU tar extended sparse header, version 0.0.

    The offset and numbytes records are collected from the raw header
    data in buf and stored pairwise in next.sparse.
    """
    offsets = [int(found.group(1))
               for found in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf)]
    sizes = [int(found.group(1))
             for found in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf)]
    next.sparse = list(zip(offsets, sizes))
1426
def _proc_gnusparse_01(self, next, pax_headers):
    """Process a GNU tar extended sparse header, version 0.1.

    The "GNU.sparse.map" value is a comma-separated list of numbers
    that is stored in next.sparse as consecutive pairs.
    """
    numbers = iter([int(item) for item in pax_headers["GNU.sparse.map"].split(",")])
    # Zipping the iterator with itself yields consecutive pairs.
    next.sparse = list(zip(numbers, numbers))
1432
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

    The sparse map is read from the archive as newline-terminated
    decimal numbers: first the number of entries, then two numbers per
    entry. Afterwards next.offset_data is set to the current file
    position and the pairs are stored in next.sparse.
    """
    numbers = []
    pending = tarfile.fileobj.read(BLOCKSIZE)
    count, pending = pending.split(b"\n", 1)
    count = int(count)
    while len(numbers) < count * 2:
        if b"\n" not in pending:
            # The current number is incomplete; fetch another block.
            pending += tarfile.fileobj.read(BLOCKSIZE)
        number, pending = pending.split(b"\n", 1)
        numbers.append(int(number))
    next.offset_data = tarfile.fileobj.tell()
    next.sparse = list(zip(numbers[::2], numbers[1::2]))
1448
Guido van Rossume7ba4952007-06-06 23:52:48 +00001449 def _apply_pax_info(self, pax_headers, encoding, errors):
1450 """Replace fields with supplemental information from a previous
1451 pax extended or global header.
1452 """
1453 for keyword, value in pax_headers.items():
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001454 if keyword == "GNU.sparse.name":
1455 setattr(self, "path", value)
1456 elif keyword == "GNU.sparse.size":
1457 setattr(self, "size", int(value))
1458 elif keyword == "GNU.sparse.realsize":
1459 setattr(self, "size", int(value))
1460 elif keyword in PAX_FIELDS:
1461 if keyword in PAX_NUMBER_FIELDS:
1462 try:
1463 value = PAX_NUMBER_FIELDS[keyword](value)
1464 except ValueError:
1465 value = 0
1466 if keyword == "path":
1467 value = value.rstrip("/")
1468 setattr(self, keyword, value)
Guido van Rossume7ba4952007-06-06 23:52:48 +00001469
1470 self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001471
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001472 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1473 """Decode a single field from a pax record.
1474 """
1475 try:
1476 return value.decode(encoding, "strict")
1477 except UnicodeDecodeError:
1478 return value.decode(fallback_encoding, fallback_errors)
1479
def _block(self, count):
    """Round up a byte count to the next multiple of BLOCKSIZE and
       return it, e.g. _block(834) => 1024.
    """
    # Negated floor division is ceiling division.
    full_blocks = -(-count // BLOCKSIZE)
    return full_blocks * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001488
def isreg(self):
    """Return True if the member's type is one of REGULAR_TYPES."""
    return self.type in REGULAR_TYPES

def isfile(self):
    """Alias for isreg()."""
    return self.isreg()

def isdir(self):
    """Return True if the member's type is DIRTYPE."""
    return self.type == DIRTYPE

def issym(self):
    """Return True if the member's type is SYMTYPE."""
    return self.type == SYMTYPE

def islnk(self):
    """Return True if the member's type is LNKTYPE."""
    return self.type == LNKTYPE

def ischr(self):
    """Return True if the member's type is CHRTYPE."""
    return self.type == CHRTYPE

def isblk(self):
    """Return True if the member's type is BLKTYPE."""
    return self.type == BLKTYPE

def isfifo(self):
    """Return True if the member's type is FIFOTYPE."""
    return self.type == FIFOTYPE

def issparse(self):
    """Return True if a sparse map has been set on the member."""
    return self.sparse is not None

def isdev(self):
    """Return True if the member's type is CHRTYPE, BLKTYPE or FIFOTYPE."""
    return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1509# class TarInfo
1510
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.

       The class attributes below are defaults. Most of them can be
       overridden per instance through the keyword arguments of the
       constructor.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions:
                                # extract() re-raises EnvironmentError when
                                # errorlevel > 0 and ExtractError only when
                                # errorlevel > 1.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.
                                # Always overwritten by the constructor's
                                # `errors' argument.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The default ExFileObject class to use.
1536
1537 def __init__(self, name=None, mode="r", fileobj=None, format=None,
1538 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
Victor Stinnerde629d42010-05-05 21:43:57 +00001539 errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001540 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
1541 read from an existing archive, 'a' to append data to an existing
1542 file or 'w' to create a new file overwriting an existing one. `mode'
1543 defaults to 'r'.
1544 If `fileobj' is given, it is used for reading or writing data. If it
1545 can be determined, `mode' is overridden by `fileobj's mode.
1546 `fileobj' is not closed, when TarFile is closed.
1547 """
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001548 if len(mode) > 1 or mode not in "raw":
Thomas Wouters477c8d52006-05-27 19:21:47 +00001549 raise ValueError("mode must be 'r', 'a' or 'w'")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001550 self.mode = mode
1551 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001552
1553 if not fileobj:
Guido van Rossumd8faa362007-04-27 19:54:29 +00001554 if self.mode == "a" and not os.path.exists(name):
Thomas Wouterscf297e42007-02-23 15:07:44 +00001555 # Create nonexistent files in append mode.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001556 self.mode = "w"
1557 self._mode = "wb"
Guido van Rossume7ba4952007-06-06 23:52:48 +00001558 fileobj = bltn_open(name, self._mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001559 self._extfileobj = False
1560 else:
Guido van Rossumd8faa362007-04-27 19:54:29 +00001561 if name is None and hasattr(fileobj, "name"):
1562 name = fileobj.name
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001563 if hasattr(fileobj, "mode"):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001564 self._mode = fileobj.mode
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001565 self._extfileobj = True
Thomas Woutersed03b412007-08-28 21:37:11 +00001566 self.name = os.path.abspath(name) if name else None
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001567 self.fileobj = fileobj
1568
Guido van Rossumd8faa362007-04-27 19:54:29 +00001569 # Init attributes.
1570 if format is not None:
1571 self.format = format
1572 if tarinfo is not None:
1573 self.tarinfo = tarinfo
1574 if dereference is not None:
1575 self.dereference = dereference
1576 if ignore_zeros is not None:
1577 self.ignore_zeros = ignore_zeros
1578 if encoding is not None:
1579 self.encoding = encoding
Victor Stinnerde629d42010-05-05 21:43:57 +00001580 self.errors = errors
Guido van Rossume7ba4952007-06-06 23:52:48 +00001581
1582 if pax_headers is not None and self.format == PAX_FORMAT:
1583 self.pax_headers = pax_headers
1584 else:
1585 self.pax_headers = {}
1586
Guido van Rossumd8faa362007-04-27 19:54:29 +00001587 if debug is not None:
1588 self.debug = debug
1589 if errorlevel is not None:
1590 self.errorlevel = errorlevel
1591
1592 # Init datastructures.
Thomas Wouters477c8d52006-05-27 19:21:47 +00001593 self.closed = False
1594 self.members = [] # list of members as TarInfo objects
1595 self._loaded = False # flag if all members have been read
Christian Heimesd8654cf2007-12-02 15:22:16 +00001596 self.offset = self.fileobj.tell()
1597 # current position in the archive file
Thomas Wouters477c8d52006-05-27 19:21:47 +00001598 self.inodes = {} # dictionary caching the inodes of
1599 # archive members already added
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001600
Lars Gustäbel7b465392009-11-18 20:29:25 +00001601 try:
1602 if self.mode == "r":
1603 self.firstmember = None
1604 self.firstmember = self.next()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001605
Lars Gustäbel7b465392009-11-18 20:29:25 +00001606 if self.mode == "a":
1607 # Move to the end of the archive,
1608 # before the first empty block.
Lars Gustäbel7b465392009-11-18 20:29:25 +00001609 while True:
Lars Gustäbel9520a432009-11-22 18:48:49 +00001610 self.fileobj.seek(self.offset)
1611 try:
1612 tarinfo = self.tarinfo.fromtarfile(self)
1613 self.members.append(tarinfo)
1614 except EOFHeaderError:
1615 self.fileobj.seek(self.offset)
Lars Gustäbel7b465392009-11-18 20:29:25 +00001616 break
Lars Gustäbel9520a432009-11-22 18:48:49 +00001617 except HeaderError as e:
1618 raise ReadError(str(e))
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001619
Lars Gustäbel7b465392009-11-18 20:29:25 +00001620 if self.mode in "aw":
1621 self._loaded = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001622
Lars Gustäbel7b465392009-11-18 20:29:25 +00001623 if self.pax_headers:
1624 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
1625 self.fileobj.write(buf)
1626 self.offset += len(buf)
1627 except:
1628 if not self._extfileobj:
1629 self.fileobj.close()
1630 self.closed = True
1631 raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001632
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001633 #--------------------------------------------------------------------------
1634 # Below are the classmethods which act as alternate constructors to the
1635 # TarFile class. The open() method is the only one that is needed for
1636 # public use; it is the "super"-constructor and is able to select an
1637 # adequate "sub"-constructor for a particular compression using the mapping
1638 # from OPEN_METH.
1639 #
1640 # This concept allows one to subclass TarFile without losing the comfort of
1641 # the super-constructor. A sub-constructor is registered and made available
1642 # by adding it to the mapping in OPEN_METH.
1643
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
       an appropriate TarFile class.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing

       `bufsize' is passed on to the stream object for the '|' modes.
       All other keyword arguments are handed to the selected
       constructor.

       Raises ValueError if neither `name' nor `fileobj' is given or if
       `mode' cannot be interpreted, CompressionError for an unknown
       compression type and ReadError if no registered method is able
       to open the file.
    """

    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Find out which *open() is appropriate for opening the file.
        for comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
            if fileobj is not None:
                saved_pos = fileobj.tell()
            try:
                return func(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Rewind so that the next candidate sees the same data.
                if fileobj is not None:
                    fileobj.seek(saved_pos)
                continue
        raise ReadError("file could not be opened successfully")

    elif ":" in mode:
        filemode, comptype = mode.split(":", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
        else:
            raise CompressionError("unknown compression type %r" % comptype)
        return func(name, filemode, fileobj, **kwargs)

    elif "|" in mode:
        filemode, comptype = mode.split("|", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Exact membership: a substring test would accept "rw" and
        # create the stream before the constructor rejects the mode.
        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            t = cls(name, filemode, stream, **kwargs)
        except:
            stream.close()
            raise
        # The stream wrapper belongs to the TarFile and is closed with it.
        t._extfileobj = False
        return t

    elif mode in ("a", "w"):
        # Exact membership: the empty string is a substring of "aw" and
        # must not be treated as a valid mode.
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001719
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.

       `mode' must be exactly 'r', 'a' or 'w', otherwise ValueError is
       raised. All other arguments are passed to the constructor.
    """
    # Exact membership: the empty string is a substring of "raw" and
    # would otherwise slip through to the constructor.
    if mode not in ("r", "a", "w"):
        raise ValueError("mode must be 'r', 'a' or 'w'")
    return cls(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001727
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError if `mode' is not exactly 'r' or 'w',
       CompressionError if the gzip module is not usable and ReadError
       if the data is not a gzip file. A GzipFile created on top of a
       caller supplied `fileobj' is left open on failure; one that was
       opened from `name' is closed.
    """
    # Exact membership: the empty string is a substring of "rw".
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    extfileobj = fileobj is not None
    try:
        fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except IOError:
        if not extfileobj and fileobj is not None:
            fileobj.close()
        if fileobj is None:
            # GzipFile itself could not be created (e.g. the file does
            # not exist), so this is not a format problem.
            raise
        raise ReadError("not a gzip file")
    except:
        if not extfileobj and fileobj is not None:
            fileobj.close()
        raise
    t._extfileobj = extfileobj
    return t
1758
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError if `mode' is not exactly 'r' or 'w',
       CompressionError if the bz2 module is not available and
       ReadError if reading the archive fails with IOError or EOFError.
       The BZ2File object is closed whenever opening the archive fails.
    """
    # Exact membership: the empty string is a substring of "rw".
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    fileobj = bz2.BZ2File(filename=name if fileobj is None else None,
            mode=mode, fileobj=fileobj, compresslevel=compresslevel)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (IOError, EOFError):
        fileobj.close()
        raise ReadError("not a bzip2 file")
    except:
        # Do not leak the BZ2File on any other failure either.
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1782
# All *open() methods are registered here. The mapping goes from the
# compression type as used in the mode string of open() to the name of
# the classmethod that handles it; open() resolves the name with
# getattr() on the class.
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz":  "gzopen",    # gzip compressed tar
    "bz2": "bz2open"    # bzip2 compressed tar
}
1789
1790 #--------------------------------------------------------------------------
1791 # The public methods which TarFile provides:
1792
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive.

       Calling close() on an already closed TarFile does nothing. The
       underlying file object is closed unless it was supplied by the
       caller, even if writing the end of the archive fails.
    """
    if self.closed:
        return

    # Mark as closed first so that a failing write below cannot leave
    # the object half-open.
    self.closed = True
    try:
        if self.mode in ("a", "w"):
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
    finally:
        if not self._extfileobj:
            self.fileobj.close()
1812
def getmember(self, name):
    """Return a TarInfo object for member `name'. If `name' can not be
       found in the archive, KeyError is raised. If a member occurs more
       than once in the archive, its last occurrence is assumed to be the
       most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001823
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects. The
       list has the same order as the members in the archive.
    """
    self._check()
    # A complete list requires that the whole archive has been scanned.
    fully_scanned = self._loaded
    if not fully_scanned:
        self._load()
    return self.members
1833
def getnames(self):
    """Return the members of the archive as a list of their names. It has
       the same order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001839
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object for either the file `name' or the file
       object `fileobj' (using os.fstat on its file descriptor). You can
       modify some of the TarInfo's attributes before you add it using
       addfile(). If given, `arcname' specifies an alternative name for the
       file in the archive.

       Returns None if the file is of a type that cannot be stored.
    """
    self._check("aw")

    # When fileobj is given, replace name by
    # fileobj's real name.
    if fileobj is not None:
        name = fileobj.name

    # Building the name of the member in the archive.
    # Backward slashes are converted to forward slashes,
    # Absolute paths are turned to relative paths.
    if arcname is None:
        arcname = name
    arcname = os.path.splitdrive(arcname)[1]
    arcname = arcname.replace(os.sep, "/").lstrip("/")

    # Now, fill the TarInfo object with
    # information specific for the file.
    info = self.tarinfo()
    info.tarfile = self

    # Use os.stat or os.lstat, depending on platform
    # and if symlinks shall be resolved.
    if fileobj is not None:
        st = os.fstat(fileobj.fileno())
    elif hasattr(os, "lstat") and not self.dereference:
        st = os.lstat(name)
    else:
        st = os.stat(name)

    target = ""
    mode_bits = st.st_mode
    if stat.S_ISREG(mode_bits):
        inode = (st.st_ino, st.st_dev)
        is_known_hardlink = (not self.dereference and st.st_nlink > 1 and
                             inode in self.inodes and
                             arcname != self.inodes[inode])
        if is_known_hardlink:
            # It is a hardlink to an already archived file.
            kind = LNKTYPE
            target = self.inodes[inode]
        else:
            # The inode is added only if its valid.
            # For win32 it is always 0.
            kind = REGTYPE
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(mode_bits):
        kind = DIRTYPE
    elif stat.S_ISFIFO(mode_bits):
        kind = FIFOTYPE
    elif stat.S_ISLNK(mode_bits):
        kind = SYMTYPE
        target = os.readlink(name)
    elif stat.S_ISCHR(mode_bits):
        kind = CHRTYPE
    elif stat.S_ISBLK(mode_bits):
        kind = BLKTYPE
    else:
        return None

    # Fill the TarInfo object with all
    # information we can get.
    info.name = arcname
    info.mode = mode_bits
    info.uid = st.st_uid
    info.gid = st.st_gid
    # Only regular files carry data in the archive.
    info.size = st.st_size if kind == REGTYPE else 0
    info.mtime = st.st_mtime
    info.type = kind
    info.linkname = target
    if pwd:
        try:
            info.uname = pwd.getpwuid(info.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            info.gname = grp.getgrgid(info.gid)[0]
        except KeyError:
            pass

    if kind in (CHRTYPE, BLKTYPE):
        if hasattr(os, "major") and hasattr(os, "minor"):
            info.devmajor = os.major(st.st_rdev)
            info.devminor = os.minor(st.st_rdev)
    return info
1937
def list(self, verbose=True):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced.
    """
    self._check()

    def show(*fields):
        # Every column is followed by a single blank instead of a newline.
        print(*fields, end=' ')

    for member in self:
        if verbose:
            show(filemode(member.mode))
            show("%s/%s" % (member.uname or member.uid,
                            member.gname or member.gid))
            if member.ischr() or member.isblk():
                device = "%d,%d" % (member.devmajor, member.devminor)
                show("%10s" % device)
            else:
                show("%10d" % member.size)
            show("%d-%02d-%02d %02d:%02d:%02d"
                 % time.localtime(member.mtime)[:6])

        suffix = "/" if member.isdir() else ""
        show(member.name + suffix)

        if verbose:
            if member.issym():
                show("->", member.linkname)
            if member.islnk():
                show("link to", member.linkname)
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001966
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. `exclude' is a function that should
       return True for each filename to be excluded; it is deprecated in
       favour of `filter' and triggers a DeprecationWarning. `filter' is a
       function that expects a TarInfo object argument and returns the
       changed TarInfo object, if it returns None the TarInfo object will
       be excluded from the archive.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)

    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        # The with statement closes the source file even if writing
        # it to the archive fails.
        with bltn_open(name, "rb") as f:
            self.addfile(tarinfo, f)

    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            for f in os.listdir(name):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                        recursive, exclude, filter=filter)

    else:
        self.addfile(tarinfo)
2028
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, tarinfo.size bytes are read from it and added to the archive.
       You can create TarInfo objects using gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.
    """
    self._check("aw")

    # Work on a copy so that the caller's object is not the one stored
    # in the member list.
    member = copy.copy(tarinfo)

    header = member.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset += len(header)

    # If there's data to follow, append it.
    if fileobj is not None:
        size = member.size
        copyfileobj(fileobj, self.fileobj, size)
        # Pad the data with NULs up to the next block boundary.
        padding = -size % BLOCKSIZE
        if padding:
            self.fileobj.write(NUL * padding)
        self.offset += size + padding

    self.members.append(member)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002054
def extractall(self, path=".", members=None):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers().
    """
    if members is None:
        members = self

    pending_dirs = []
    for member in members:
        if member.isdir():
            # Extract directories with a safe mode. The original object
            # is remembered, a copy with mode 0o700 is extracted.
            pending_dirs.append(member)
            member = copy.copy(member)
            member.mode = 0o700
        # Do not set_attrs directories, as we will do that further down
        self.extract(member, path, set_attrs=not member.isdir())

    # Reverse sort directories.
    ordered = sorted(pending_dirs, key=lambda entry: entry.name)[::-1]

    # Set correct owner, mtime and filemode on directories.
    for entry in ordered:
        target = os.path.join(path, entry.name)
        try:
            self.chown(entry, target)
            self.utime(entry, target)
            self.chmod(entry, target)
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % e)
2092
def extract(self, member, path="", set_attrs=True):
    """Extract a member from the archive to the current working directory,
       using its full name. Its file information is extracted as accurately
       as possible. `member' may be a filename or a TarInfo object. You can
       specify a different directory using `path'. File attributes (owner,
       mtime, mode) are set unless `set_attrs' is False.

       EnvironmentError is passed on if errorlevel is greater than 0,
       ExtractError if it is greater than 1. Otherwise the error is
       only reported as a debug message.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Prepare the link target for makelink().
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    destination = os.path.join(path, tarinfo.name)
    try:
        self._extract_member(tarinfo, destination, set_attrs=set_attrs)
    except EnvironmentError as e:
        if self.errorlevel > 0:
            raise
        if e.filename is None:
            message = "tarfile: %s" % e.strerror
        else:
            message = "tarfile: %s %r" % (e.strerror, e.filename)
        self._dbg(1, message)
    except ExtractError as e:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % e)
2127
def extractfile(self, member):
    """Return a file object for `member', or None.

       `member' may be a member name or a TarInfo object. Regular files
       and members of an unsupported (unknown) type are wrapped with
       self.fileobject(). A hard or symbolic link is resolved to its
       target member inside the archive and that member's file object
       is returned. Any other member (directory, device, fifo, ...)
       carries no data and yields None.

       Raises StreamError when a link is requested while the archive
       is being read through a _Stream object.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Regular files, and unknown types which are treated like regular
    # files, are served straight from the archive.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    # Nothing but links is left that could have data behind it.
    if not (tarinfo.islnk() or tarinfo.issym()):
        return None

    if isinstance(self.fileobj, _Stream):
        # Resolving the link needs a lookup of another member, which
        # the stream-of-blocks interface does not allow.
        raise StreamError("cannot extract (sym)link as file object")

    # A (sym)link's file object is its target's file object.
    return self.extractfile(self._find_link_target(tarinfo))
2165
def _extract_member(self, tarinfo, targetpath, set_attrs=True):
    """Materialise the TarInfo object `tarinfo' as `targetpath'.

       Trailing slashes are stripped from `targetpath' and forward
       slashes are replaced by the platform separator. Missing parent
       directories are created first, then the member is handed to the
       make*() method that matches its type. Unless `set_attrs' is
       False, owner is set afterwards, and mode and mtime as well for
       everything but symbolic links.
    """
    # Build the destination pathname in the platform's notation.
    targetpath = targetpath.rstrip("/").replace("/", os.sep)

    # Parent directories that are not part of the archive are created
    # with default permissions.
    parent = os.path.dirname(targetpath)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    if tarinfo.islnk() or tarinfo.issym():
        note = "%s -> %s" % (tarinfo.name, tarinfo.linkname)
    else:
        note = tarinfo.name
    self._dbg(1, note)

    # Pick the creator for this member type; the order of the checks
    # decides which one wins.
    if tarinfo.isreg():
        make = self.makefile
    elif tarinfo.isdir():
        make = self.makedir
    elif tarinfo.isfifo():
        make = self.makefifo
    elif tarinfo.ischr() or tarinfo.isblk():
        make = self.makedev
    elif tarinfo.islnk() or tarinfo.issym():
        make = self.makelink
    elif tarinfo.type not in SUPPORTED_TYPES:
        make = self.makeunknown
    else:
        make = self.makefile
    make(tarinfo, targetpath)

    if not set_attrs:
        return

    self.chown(tarinfo, targetpath)
    # Mode and mtime are not applied to symbolic links.
    if not tarinfo.issym():
        self.chmod(tarinfo, targetpath)
        self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002208
2209 #--------------------------------------------------------------------------
2210 # Below are the different file methods. They are called via
2211 # _extract_member() when extract() is called. They can be replaced in a
2212 # subclass to implement other functionality.
2213
def makedir(self, tarinfo, targetpath):
    """Create the directory `targetpath'.

       An already existing `targetpath' is left alone. `tarinfo' is
       not consulted here.
    """
    # Start out with an owner-only mode; the mode recorded in the
    # archive is applied later by the caller.
    safe_mode = 0o700
    try:
        os.mkdir(targetpath, safe_mode)
    except FileExistsError:
        # Nothing to do, the directory is already there.
        pass
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002223
def makefile(self, tarinfo, targetpath):
    """Write the data of `tarinfo' to a new file called `targetpath'.

       The data is read from self.fileobj starting at
       tarinfo.offset_data. If tarinfo.sparse is set, it is iterated
       as (offset, size) pairs and each chunk is written at its offset
       in the target; otherwise tarinfo.size bytes are copied in one
       go. The result is finally cut or extended to exactly
       tarinfo.size bytes.

       The target file is closed even if seeking or copying fails.
    """
    source = self.fileobj
    source.seek(tarinfo.offset_data)
    # Using the file as a context manager releases the handle on every
    # exit path, including a failing copy in the middle of the data.
    with bltn_open(targetpath, "wb") as target:
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
        else:
            copyfileobj(source, target, tarinfo.size)
        # A sparse file may end in a hole, so the final length has to
        # be enforced explicitly.
        target.seek(tarinfo.size)
        target.truncate()
2239
def makeunknown(self, tarinfo, targetpath):
    """Extract a member of unknown type to `targetpath'.

       The member is written with makefile(), and a level 1 debug
       message naming the unrecognised type is emitted afterwards.
    """
    self.makefile(tarinfo, targetpath)
    notice = ("tarfile: Unknown file type %r, "
              "extracted as regular file." % tarinfo.type)
    self._dbg(1, notice)
2247
def makefifo(self, tarinfo, targetpath):
    """Create a fifo called `targetpath'.

       Raises ExtractError if the os module offers no mkfifo().
       `tarinfo' is not consulted here.
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002255
def makedev(self, tarinfo, targetpath):
    """Create a character or block device node called `targetpath'.

       The node gets tarinfo.mode combined with the block or character
       device type bit and the device number built from
       tarinfo.devmajor and tarinfo.devminor.

       Raises ExtractError if the os module lacks mknod() or
       makedev().
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Anything that is not a block device is created as a character
    # device.
    mode = tarinfo.mode | (stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR)
    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, mode, device)
2270
def makelink(self, tarinfo, targetpath):
    """Make a (symbolic) link called `targetpath'.

       A symbolic link member becomes an os.symlink() to
       tarinfo.linkname. A hard link member becomes an os.link() to
       tarinfo._link_target (prepared by extract()) if that path
       exists; otherwise the member it refers to is extracted to
       `targetpath' instead.

       If creating the link raises one of the exceptions in
       symlink_exception, the link's target member is looked up in the
       archive and extracted to `targetpath' as a copy. Raises
       ExtractError if that lookup fails.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        elif os.path.exists(tarinfo._link_target):
            # See extract() for where _link_target comes from.
            os.link(tarinfo._link_target, targetpath)
        else:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
    except symlink_exception:
        # The copy fallback must only run when linking failed. Running
        # it after a successful link would extract the target on top
        # of the link that was just created.
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002299
def chown(self, tarinfo, targetpath):
    """Set the owner of `targetpath' according to `tarinfo'.

       Nothing happens unless the pwd module is available and the
       effective user id is 0. Group and user are looked up by
       tarinfo.gname and tarinfo.uname, falling back to the numeric
       tarinfo.gid and tarinfo.uid when a name is unknown.

       Raises ExtractError if the ownership change fails.
    """
    # We have to be root to change ownership.
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        return

    try:
        gid = grp.getgrnam(tarinfo.gname)[2]
    except KeyError:
        gid = tarinfo.gid
    try:
        uid = pwd.getpwnam(tarinfo.uname)[2]
    except KeyError:
        uid = tarinfo.uid

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            # Change the link itself instead of what it points to.
            os.lchown(targetpath, uid, gid)
        elif sys.platform != "os2emx":
            os.chown(targetpath, uid, gid)
    except EnvironmentError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002321
def chmod(self, tarinfo, targetpath):
    """Set the permissions of `targetpath' to tarinfo.mode.

       Does nothing if the os module has no chmod(). Raises
       ExtractError if the mode cannot be changed.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except EnvironmentError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002330
def utime(self, tarinfo, targetpath):
    """Set the access and modification time of `targetpath' to
       tarinfo.mtime.

       Does nothing if the os module has no utime(). Raises
       ExtractError if the timestamps cannot be changed.
    """
    if hasattr(os, 'utime'):
        try:
            # Only mtime is taken from tarinfo, so atime gets the
            # same value.
            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
        except EnvironmentError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002340
2341 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, or
       None if there is none left. Requires mode "r" or "a".

       A member stored in self.firstmember is handed out first.
       Otherwise a header is read at self.offset; a successfully read
       member is appended to self.members, and when nothing could be
       read the archive is marked as fully loaded.

       Raises ReadError for an invalid, empty or truncated header at
       offset 0 and for any SubsequentHeaderError. With ignore_zeros
       set, EOF and invalid headers are skipped block by block
       instead.
    """
    self._check("ra")

    pending = self.firstmember
    if pending is not None:
        self.firstmember = None
        return pending

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as e:
            if self.ignore_zeros:
                # Skip this block and try the following one.
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as e:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
            # Past the first block, an invalid header just ends the
            # member list.
            if self.offset == 0:
                raise ReadError(str(e))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as e:
            if self.offset == 0:
                raise ReadError(str(e))
        except SubsequentHeaderError as e:
            raise ReadError(str(e))
        break

    if tarinfo is None:
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002387
2388 #--------------------------------------------------------------------------
2389 # Little helper methods:
2390
def _getmember(self, name, tarinfo=None, normalize=False):
    """Return the last archive member called `name', or None.

       The member list is searched from bottom to top. If `tarinfo'
       is given, only the members stored before it are considered. If
       `normalize' is true, `name' and every member name are passed
       through os.path.normpath() before they are compared.
    """
    # getmembers() makes sure that all members have been loaded.
    candidates = self.getmembers()

    # Limit the member search list up to tarinfo.
    if tarinfo is not None:
        candidates = candidates[:candidates.index(tarinfo)]

    if normalize:
        name = os.path.normpath(name)

    for candidate in reversed(candidates):
        candidate_name = (os.path.normpath(candidate.name) if normalize
                          else candidate.name)
        if name == candidate_name:
            return candidate
    return None
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002413
def _load(self):
    """Read through the whole archive by calling next() until it
       returns None, then mark the archive as loaded.
    """
    while self.next() is not None:
        pass
    self._loaded = True
2423
def _check(self, mode=None):
    """Make sure the TarFile is usable for the requested operation.

       Raises IOError if the TarFile is closed, or if `mode' is given
       and self.mode is not contained in it.
    """
    if self.closed:
        raise IOError("%s is closed" % self.__class__.__name__)
    if mode is None or self.mode in mode:
        return
    raise IOError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002432
def _find_link_target(self, tarinfo):
    """Return the archive member that the symbolic or hard link
       member `tarinfo' refers to.

       For a symbolic link the link name is taken relative to the
       directory of the link itself and the whole archive is searched.
       For a hard link only the members stored before the link are
       searched. Names are compared in normalized form.

       Raises KeyError if no such member exists.
    """
    if tarinfo.issym():
        # Always search the entire archive. Only the non-empty parts
        # are joined: for a link in the archive root the directory
        # part is "", and a bare "/" prefix would turn the name into
        # an absolute path that matches no member.
        linkname = "/".join(
            filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
        limit = None
    else:
        # Search the archive before the link, because a hard link is
        # just a reference to an already archived file.
        linkname = tarinfo.linkname
        limit = tarinfo

    member = self._getmember(linkname, tarinfo=limit, normalize=True)
    if member is None:
        raise KeyError("linkname %r not found" % linkname)
    return member
2451
def __iter__(self):
    """Provide an iterator over the members.

       Once the archive is fully loaded the member list itself is
       iterated; before that a TarIter is returned instead.
    """
    return iter(self.members) if self._loaded else TarIter(self)
2459
def _dbg(self, level, msg):
    """Print `msg' to sys.stderr if `level' does not exceed
       self.debug.
    """
    verbose_enough = level <= self.debug
    if verbose_enough:
        print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002465
def __enter__(self):
    """Enter a with-block: verify via _check() that the TarFile is
       still open and return the TarFile itself.
    """
    self._check()
    return self
2469
def __exit__(self, type, value, traceback):
    """Leave a with-block.

       Without a pending exception the archive is closed through
       close(). With one, close() is skipped: the underlying file
       object is closed directly, unless self._extfileobj is set, and
       the TarFile is flagged as closed. A pending exception is never
       suppressed.
    """
    if type is None:
        self.close()
        return

    # An exception occurred. We must not call close() because
    # it would try to write end-of-archive blocks and padding.
    if not self._extfileobj:
        self.fileobj.close()
    self.closed = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002479# class TarFile
2480
class TarIter:
    """Iterator over the members of a TarFile.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Start iterating over `tarfile' at its first member.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return the iterator itself.
        """
        return self

    def __next__(self):
        """Return the next member.

           While the TarFile is not fully loaded, members are obtained
           from its next() method, and the TarFile is marked as loaded
           when that runs dry. Afterwards they are taken from its
           member list by position.
        """
        archive = self.tarfile
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely.
        if archive._loaded:
            try:
                tarinfo = archive.members[self.index]
            except IndexError:
                raise StopIteration
        else:
            tarinfo = archive.next()
            if not tarinfo:
                archive._loaded = True
                raise StopIteration
        self.index += 1
        return tarinfo
2516
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002517#--------------------
2518# exported functions
2519#--------------------
def is_tarfile(name):
    """Return True if `name' can be opened as a tar archive, else
       False.

       The archive is opened with the module's open() and closed again
       right away; a TarError raised on the way means False.
    """
    try:
        open(name).close()
    except TarError:
        return False
    return True
2530
# Keep the builtin open() reachable under another name, because the
# module-level name `open' is rebound on the next line.
bltn_open = open
# At module level, open() is TarFile.open.
open = TarFile.open