blob: cf4e1643f897837e1ca4b52851b35a889f05067b [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
# Module metadata: release string, author/credits and the revision-control
# keyword strings recorded for this file.
version = "0.9.0"
__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000037
38#---------
39# Imports
40#---------
41import sys
42import os
43import shutil
44import stat
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000045import time
46import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000047import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000048import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000049
50try:
51 import grp, pwd
52except ImportError:
53 grp = pwd = None
54
# Tuple of exception types that signal "symbolic links cannot be created
# here".  NOTE(review): presumably caught around os.symlink() calls further
# down in this module -- the use site is outside this excerpt.
# os.symlink on Windows prior to 6.0 raises NotImplementedError
symlink_exception = (AttributeError, NotImplementedError)
try:
    # WindowsError (1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (WindowsError,)
except NameError:
    # WindowsError is not defined as a builtin on this platform.
    pass
63
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000064# from tarfile import *
65__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
66
Georg Brandl1a3284e2007-12-02 09:40:06 +000067from builtins import open as _open # Since 'open' is TarFile.open
Guido van Rossum8f78fe92006-08-24 04:03:53 +000068
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000069#---------------------------------------------------------
70# tar constants
71#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
# Both magic strings cover the 8 bytes of the header's magic + version
# fields.  The GNU variant is "ustar" followed by TWO spaces and a NUL.
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000104
105#---------------------------------------------------------
106# tarfile constants
107#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Each key maps to the callable used to
# convert the field's text value.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}
140
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000141#---------------------------------------------------------
142# Bits used in the mode field, values in octal.
143#---------------------------------------------------------
# File-type bits.
S_IFLNK = 0o120000        # symbolic link
S_IFREG = 0o100000        # regular file
S_IFBLK = 0o060000        # block device
S_IFDIR = 0o040000        # directory
S_IFCHR = 0o020000        # character device
S_IFIFO = 0o010000        # fifo

# Special permission bits; filemode_table renders them as s/S and t/T.
TSUID = 0o4000            # set UID on execution
TSGID = 0o2000            # set GID on execution
TSVTX = 0o1000            # reserved

# Ordinary permission bits for user, group and other.
TUREAD = 0o400            # read by owner
TUWRITE = 0o200           # write by owner
TUEXEC = 0o100            # execute/search by owner
TGREAD = 0o040            # read by group
TGWRITE = 0o020           # write by group
TGEXEC = 0o010            # execute/search by group
TOREAD = 0o004            # read by other
TOWRITE = 0o002           # write by other
TOEXEC = 0o001            # execute/search by other
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000164
165#---------------------------------------------------------
Guido van Rossumd8faa362007-04-27 19:54:29 +0000166# initialization
167#---------------------------------------------------------
# Module-wide default encoding: UTF-8 when os.name is "nt" or "ce",
# otherwise whatever sys.getfilesystemencoding() reports.
if os.name in ("nt", "ce"):
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000172
173#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000174# Some useful functions
175#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000176
def stn(s, length, encoding, errors):
    """Encode the string *s* into a bytes field of exactly *length* bytes.

    The encoded value is padded on the right with NUL bytes when it is
    shorter than *length* and cut off at *length* bytes when it is longer
    (in which case no terminating NUL remains).
    """
    encoded = s.encode(encoding, errors)
    # A negative repeat count yields b"", so over-long values get no padding.
    padding = NUL * (length - len(encoded))
    return encoded[:length] + padding
Thomas Wouters477c8d52006-05-27 19:21:47 +0000182
def nts(s, encoding, errors):
    """Decode a NUL-terminated bytes field into a string.

    Everything from the first NUL byte onwards is discarded; when there is
    no NUL byte the whole field is decoded.
    """
    end = s.find(b"\0")
    payload = s if end == -1 else s[:end]
    return payload.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000190
def nti(s):
    """Convert a number field (bytes) from a tar header to a Python int.

    Two encodings are understood (see itn()):

    * GNU base-256: the first byte is 0o200 (positive) or 0o377 (negative)
      and the remaining bytes are a big-endian base-256 number.
    * Octal text: ASCII octal digits terminated by a NUL byte.  Surrounding
      whitespace is ignored, and a field that is empty or contains only
      blanks decodes as 0.

    Raises InvalidHeaderError if the octal text cannot be parsed.
    """
    if s[0] in (0o200, 0o377):
        n = int.from_bytes(s[1:], "big")
        if s[0] == 0o377:
            # Negative values are stored as the two's complement over the
            # len(s) - 1 payload bytes.
            n = -(256 ** (len(s) - 1) - n)
    else:
        try:
            text = nts(s, "ascii", "strict")
            # Some archivers pad number fields with spaces only; int()
            # rejects an all-blank string, so map it to zero explicitly.
            n = int(text.strip() or "0", 8)
        except ValueError:
            # Also covers UnicodeDecodeError for non-ASCII bytes.
            raise InvalidHeaderError("invalid header")
    return n
209
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a Python number to a tar header number field.

    Returns a field of *digits* bytes.  Raises ValueError ("overflow in
    number field") when *n* fits neither encoding allowed for *format*.
    """
    # POSIX 1003.1-1988 stores numbers as octal digits followed by a NUL
    # byte, which covers 0 .. 8**(digits-1) - 1.  GNU tar additionally
    # allows a base-256 form: a marker byte (0o200 for positive, 0o377 for
    # negative values) followed by digits-1 big-endian payload bytes.
    width = digits - 1

    if 0 <= n < 8 ** width:
        return bytes("%0*o" % (width, n), "ascii") + NUL

    if format != GNU_FORMAT or not (-256 ** width <= n < 256 ** width):
        raise ValueError("overflow in number field")

    if n >= 0:
        marker = 0o200
    else:
        marker = 0o377
        # Shift into the non-negative range; only the low `width` bytes
        # are emitted below, which is the two's complement of n.
        n = 256 ** digits + n

    payload = bytearray()
    for _ in range(width):
        payload.insert(0, n & 0o377)
        n >>= 8

    return bytearray([marker]) + payload
237
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a 512-byte header block.

    The 8-byte chksum field at offset 148 is not read; it is counted as
    eight spaces (8 * 0x20 == 256).  According to the GNU tar sources, some
    tars (Sun and NeXT) sum the header as signed chars, which gives a
    different result when bytes with the high bit set are present, so both
    variants are computed.
    """
    # "8x" skips the chksum field itself.
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000250
def copyfileobj(src, dst, length=None):
    """Copy *length* bytes from file object *src* to file object *dst*.

    With length=None everything up to EOF is copied via
    shutil.copyfileobj().  Otherwise exactly *length* bytes are transferred
    in chunks of at most 16 KiB, and IOError("end of file reached") is
    raised as soon as *src* delivers fewer bytes than requested.
    """
    if length is None:
        shutil.copyfileobj(src, dst)
        return
    if length == 0:
        return

    chunk_size = 16 * 1024

    def transfer(count):
        # Move exactly `count` bytes or fail on a short read.
        data = src.read(count)
        if len(data) < count:
            raise IOError("end of file reached")
        dst.write(data)

    full_chunks, tail = divmod(length, chunk_size)
    for _ in range(full_chunks):
        transfer(chunk_size)
    if tail != 0:
        transfer(tail)
275
# Lookup table for filemode().  Each inner tuple produces one character of
# the result: file type first, then read/write/execute for user, group and
# other.  filemode() tries the (bits, char) pairs of an inner tuple in order
# and uses the first pair whose bits are all set, so the combined entries
# (e.g. TUEXEC|TSUID) must come before the single-bit ones.
filemode_table = (
    ((S_IFLNK, "l"),
     (S_IFREG, "-"),
     (S_IFBLK, "b"),
     (S_IFDIR, "d"),
     (S_IFCHR, "c"),
     (S_IFIFO, "p")),

    ((TUREAD, "r"),),
    ((TUWRITE, "w"),),
    ((TUEXEC|TSUID, "s"),
     (TSUID, "S"),
     (TUEXEC, "x")),

    ((TGREAD, "r"),),
    ((TGWRITE, "w"),),
    ((TGEXEC|TSGID, "s"),
     (TSGID, "S"),
     (TGEXEC, "x")),

    ((TOREAD, "r"),),
    ((TOWRITE, "w"),),
    ((TOEXEC|TSVTX, "t"),
     (TSVTX, "T"),
     (TOEXEC, "x"))
)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000302
def filemode(mode):
    """Render a numeric file mode as a string of the form -rwxrwxrwx.

    One character is produced per row of filemode_table: the character of
    the first entry whose bits are all set in *mode*, or "-" when no entry
    of that row matches.  Used by TarFile.list().
    """
    return "".join(
        next((char for bit, char in choices if mode & bit == bit), "-")
        for choices in filemode_table
    )
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000317
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000351
352#---------------------------
353# internal stream interface
354#---------------------------
class _LowLevelFile:
    """Low-level file object built directly on os.open()/os.read()/os.write().

    Supports reading and writing.  It is used instead of a regular file
    object for streaming access.
    """

    def __init__(self, name, mode):
        """Open *name* for reading (mode "r") or writing (mode "w").

        Write mode creates the file if needed and truncates it.  Any other
        mode raises KeyError.
        """
        flags = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        if hasattr(os, "O_BINARY"):
            # Only present on platforms that distinguish text from binary.
            flags |= os.O_BINARY
        self.fd = os.open(name, flags, 0o666)

    def close(self):
        """Close the underlying file descriptor."""
        os.close(self.fd)

    def read(self, size):
        """Return at most *size* bytes read from the descriptor."""
        return os.read(self.fd, size)

    def write(self, s):
        """Write all of the bytes-like object *s* to the descriptor."""
        # os.write() may accept only part of the data (pipes, devices,
        # interrupted calls); keep going until everything has been handed
        # over so no data is dropped silently.
        pending = memoryview(s)
        while len(pending) > 0:
            written = os.write(self.fd, pending)
            pending = pending[written:]
378
class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip or bzip2 compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.

        name     -- file name; opened through _LowLevelFile when *fileobj*
                    is None
        mode     -- "r" or "w"
        comptype -- "tar" (no compression), "gz", "bz2" or "*" (detect the
                    compression from the first block)
        fileobj  -- existing stream-like object, or None
        bufsize  -- block size used for raw reads and writes

        Raises CompressionError when the required compression module is
        missing; an internally opened file is closed again on any failure.
        """
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""       # raw (possibly compressed) data not yet consumed/written
        self.pos = 0         # position in the uncompressed stream
        self.closed = False

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                # Exception type the decompressor raises on corrupt input.
                self.exception = zlib.error
                if mode == "r":
                    self._init_read_gz()
                else:
                    self._init_write_gz()

            if comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                self.exception = IOError
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                else:
                    self.cmp = bz2.BZ2Compressor()
        except:
            # Deliberately broad: whatever went wrong, do not leak a file
            # this object opened itself, then re-raise unchanged.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # "closed" is missing if __init__ failed before setting it.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)
        timestamp = struct.pack("<L", int(time.time()))
        # Magic, deflate method, FNAME flag, mtime, extra flags, OS byte.
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write bytes s to the stream, compressing them if necessary.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Buffer raw bytes s and pass every complete block of bufsize
           bytes on to the underlying file object.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        # Mark as closed first so a failure below cannot trigger a second
        # flush attempt from __del__.
        self.closed = True
        try:
            if self.mode == "w" and self.comptype != "tar":
                self.buf += self.cmp.flush()

            if self.mode == "w" and self.buf:
                self.fileobj.write(self.buf)
                self.buf = b""
                if self.comptype == "gz":
                    # gzip trailer: CRC32 and uncompressed size, both
                    # little-endian and masked to unsigned 32 bits so that
                    # struct never sees a value outside the "<L" range.
                    self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
        finally:
            # Release a file we opened ourselves even if flushing failed.
            if not self._extfileobj:
                self.fileobj.close()

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Consumes the gzip member header from the raw stream.  Raises
        ReadError for a wrong magic number and CompressionError for a
        compression method other than deflate.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)          # mtime, extra flags, OS

        if flag & 4:
            # FEXTRA: the field is part of the *raw* header, so it must be
            # skipped with __read(); going through read() would feed it to
            # the decompressor and advance the stream position.
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.__read(xlen)
        if flag & 8:
            # FNAME: NUL-terminated original file name.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            # FCOMMENT: NUL-terminated comment.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            # FHCRC: 16-bit header checksum.
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.

           Forward seeks are emulated by reading and discarding data;
           a backward seek raises StreamError.
        """
        if pos - self.pos >= 0:
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            chunks = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                chunks.append(buf)
            # The chunks are bytes, so the separator must be bytes too.
            buf = b"".join(chunks)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes of uncompressed data from the stream.

        Raises ReadError if the compressed data cannot be decoded.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception as e:
                raise ReadError("invalid compressed data") from e
            self.dbuf += buf
            c += len(buf)
        buf = self.dbuf[:size]
        self.dbuf = self.dbuf[size:]
        return buf

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            self.buf += buf
            c += len(buf)
        buf = self.buf[:size]
        self.buf = self.buf[size:]
        return buf
# class _Stream
610
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        """Wrap *fileobj* and pre-read its first block for inspection."""
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        """Return the pre-read first block (regardless of *size*).

        This method replaces itself on the instance with the wrapped
        object's read(), so every later call goes straight through.
        """
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Return "gz", "bz2" or "tar" judging by the first block."""
        if self.buf.startswith(b"\037\213\010"):
            return "gz"
        # A bzip2 stream starts with "BZh", one digit for the block size
        # (1-9) and then the block magic "1AY&SY".  Skipping the digit makes
        # detection work for every compression level, not just level 9.
        if self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            return "bz2"
        return "tar"

    def close(self):
        """Close the wrapped file object."""
        self.fileobj.close()
# class StreamProxy
634
class _BZ2Proxy(object):
    """Small proxy class that enables external file object
       support for "r:bz2" and "w:bz2" modes. This is actually
       a workaround for a limitation in bz2 module's BZ2File
       class which (unlike gzip.GzipFile) has no support for
       a file object argument.
    """

    # Number of compressed bytes fetched per read from the wrapped object.
    blocksize = 16 * 1024

    def __init__(self, fileobj, mode):
        """Wrap *fileobj* for reading (mode "r") or writing bzip2 data."""
        self.fileobj = fileobj
        self.mode = mode
        self.name = getattr(self.fileobj, "name", None)
        self.init()

    def init(self):
        """(Re)start at position 0 with a fresh (de)compressor.

        In read mode the wrapped file object is rewound as well.
        """
        import bz2
        self.pos = 0
        if self.mode != "r":
            self.bz2obj = bz2.BZ2Compressor()
            return
        self.bz2obj = bz2.BZ2Decompressor()
        self.fileobj.seek(0)
        self.buf = b""

    def read(self, size):
        """Return up to *size* bytes of decompressed data."""
        # Decompress further input until enough data is buffered or the
        # wrapped file object is exhausted.
        while len(self.buf) < size:
            compressed = self.fileobj.read(self.blocksize)
            if not compressed:
                break
            self.buf += self.bz2obj.decompress(compressed)

        chunk, self.buf = self.buf[:size], self.buf[size:]
        self.pos += len(chunk)
        return chunk

    def seek(self, pos):
        """Move to uncompressed position *pos*.

        Going backwards restarts decompression from the beginning; the
        distance to *pos* is then covered by reading and discarding.
        """
        if pos < self.pos:
            self.init()
        self.read(pos - self.pos)

    def tell(self):
        """Return the current uncompressed position."""
        return self.pos

    def write(self, data):
        """Compress *data* and hand the output to the wrapped object."""
        self.pos += len(data)
        self.fileobj.write(self.bz2obj.compress(data))

    def close(self):
        """Flush the compressor in write mode.

        The wrapped file object itself is not closed here.
        """
        if self.mode == "w":
            self.fileobj.write(self.bz2obj.flush())
# class _BZ2Proxy
694
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000695#------------------------
696# Extraction file object
697#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        """Expose *size* bytes of *fileobj* as a file of their own.

        offset    -- position in *fileobj* where the stored data starts
        size      -- logical size of the exposed file
        blockinfo -- optional list of (offset, size) pairs naming the
                     regions of the logical file that are actually stored;
                     everything else reads as NUL bytes.  None means the
                     whole file is stored contiguously.
        """
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Build a map of (is_data, start, stop, position_in_fileobj)
        # entries covering the logical file; holes carry None as position.
        self.map_index = 0
        self.map = []
        covered = 0
        stored_at = self.offset
        for block_start, block_len in blockinfo:
            if block_start > covered:
                self.map.append((False, covered, block_start, None))
            block_stop = block_start + block_len
            self.map.append((True, block_start, block_stop, stored_at))
            stored_at += block_len
            covered = block_stop
        if covered < self.size:
            self.map.append((False, covered, self.size, None))

    def seekable(self):
        """Report whether the wrapped object is seekable.

        Objects without a seekable() method are assumed to be seekable.
        """
        try:
            probe = self.fileobj.seekable
        except AttributeError:
            # XXX gzip.GzipFile and bz2.BZ2File
            return True
        return probe()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position):
        """Seek to a position in the file.
        """
        self.position = position

    def read(self, size=None):
        """Read data from the file.

        Returns at most *size* bytes (everything up to the logical end of
        the file when *size* is None).
        """
        remaining = self.size - self.position
        if size is not None:
            remaining = min(size, remaining)

        pieces = []
        while remaining > 0:
            # Find the map entry containing the current position, starting
            # at the entry used last and wrapping around at the end.
            while True:
                is_data, start, stop, stored_at = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                self.map_index += 1
                if self.map_index == len(self.map):
                    self.map_index = 0

            length = min(remaining, stop - self.position)
            if is_data:
                self.fileobj.seek(stored_at + (self.position - start))
                pieces.append(self.fileobj.read(length))
            else:
                pieces.append(NUL * length)
            remaining -= length
            self.position += length
        return b"".join(pieces)
#class _FileInFile
771
772
class ExFileObject(object):
    """File-like object for reading an archive member.
       Is returned by TarFile.extractfile().

       Data that readline() has fetched from the underlying file object
       but not yet handed out is kept in self.buffer; self.position is
       the logical read position as seen by the caller.
    """
    # Number of bytes readline() fetches per read while searching for a
    # newline.
    blocksize = 1024

    def __init__(self, tarfile, tarinfo):
        """Wrap the data area of the member tarinfo of the open
           TarFile object tarfile.
        """
        self.fileobj = _FileInFile(tarfile.fileobj,
                                   tarinfo.offset_data,
                                   tarinfo.size,
                                   tarinfo.sparse)
        self.name = tarinfo.name
        self.mode = "r"
        self.closed = False
        self.size = tarinfo.size

        self.position = 0
        self.buffer = b""

    def readable(self):
        """Return True, the object can always be read from.
        """
        return True

    def writable(self):
        """Return False, the object is read-only.
        """
        return False

    def seekable(self):
        """Return whether the underlying file object is seekable.
        """
        return self.fileobj.seekable()

    def read(self, size=None):
        """Read at most size bytes from the file. If size is not
           present, None or negative, read all data until EOF is
           reached.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        # A negative size means "no limit". Without this normalization
        # the slices below would cut the buffer from its end.
        if size is not None and size < 0:
            size = None

        buf = b""
        if self.buffer:
            # Hand out what readline() has already buffered first.
            if size is None:
                buf = self.buffer
                self.buffer = b""
            else:
                buf = self.buffer[:size]
                self.buffer = self.buffer[size:]

        if size is None:
            buf += self.fileobj.read()
        else:
            buf += self.fileobj.read(size - len(buf))

        self.position += len(buf)
        return buf

    # XXX TextIOWrapper uses the read1() method.
    read1 = read

    def readline(self, size=-1):
        """Read one entire line from the file. If size is present
           and non-negative, return a string with at most that
           size, which may be an incomplete line. A size of None or
           any negative value means no limit.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        pos = self.buffer.find(b"\n") + 1
        if pos == 0:
            # no newline found.
            while True:
                buf = self.fileobj.read(self.blocksize)
                self.buffer += buf
                if not buf or b"\n" in buf:
                    pos = self.buffer.find(b"\n") + 1
                    if pos == 0:
                        # no newline found.
                        pos = len(self.buffer)
                    break

        # Only a non-negative size limits the line. Any other negative
        # value used to be passed to min() and truncated the line from
        # its end, and None raised a TypeError.
        if size is not None and size >= 0:
            pos = min(size, pos)

        buf = self.buffer[:pos]
        self.buffer = self.buffer[pos:]
        self.position += len(buf)
        return buf

    def readlines(self):
        """Return a list with all remaining lines.
        """
        result = []
        while True:
            line = self.readline()
            if not line: break
            result.append(line)
        return result

    def tell(self):
        """Return the current file position.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        return self.position

    def seek(self, pos, whence=os.SEEK_SET):
        """Seek to a position in the file and return the new
           absolute position. The position is clamped to the range
           0..self.size.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        if whence == os.SEEK_SET:
            self.position = min(max(pos, 0), self.size)
        elif whence == os.SEEK_CUR:
            if pos < 0:
                self.position = max(self.position + pos, 0)
            else:
                self.position = min(self.position + pos, self.size)
        elif whence == os.SEEK_END:
            self.position = max(min(self.size + pos, self.size), 0)
        else:
            raise ValueError("Invalid argument")

        # Buffered data belongs to the old position, drop it.
        self.buffer = b""
        self.fileobj.seek(self.position)
        return self.position

    def close(self):
        """Close the file object.
        """
        self.closed = True

    def __iter__(self):
        """Get an iterator over the file's lines.
        """
        while True:
            line = self.readline()
            if not line:
                break
            yield line
#class ExFileObject
910
911#------------------
912# Exported Classes
913#------------------
914class TarInfo(object):
915 """Informational class which holds the details about an
916 archive member given by a tar header block.
917 TarInfo objects are returned by TarFile.getmember(),
918 TarFile.getmembers() and TarFile.gettarinfo() and are
919 usually created internally.
920 """
921
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000922 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
923 "chksum", "type", "linkname", "uname", "gname",
924 "devmajor", "devminor",
925 "offset", "offset_data", "pax_headers", "sparse",
926 "tarfile", "_sparse_structs", "_link_target")
927
def __init__(self, name=""):
    """Construct a TarInfo object. name is the optional name
       of the member. All other attributes start out with neutral
       default values.
    """
    # Values that are stored in the header block.
    self.name = name                # member name
    self.mode = 0o644               # file permissions
    self.uid = self.gid = 0         # user id and group id
    self.size = self.mtime = 0      # file size and modification time
    self.chksum = 0                 # header checksum
    self.type = REGTYPE             # member type
    self.linkname = ""              # link name
    self.uname = self.gname = ""    # user name and group name
    self.devmajor = self.devminor = 0   # device major and minor number

    # Positions inside the archive: where the tar header starts and
    # where the file's data starts.
    self.offset = self.offset_data = 0

    self.sparse = None              # sparse member information
    self.pax_headers = {}           # pax header information
951
# The pax format calls the "name" and "linkname" fields "path" and
# "linkpath". Both spellings are offered as aliases of the same
# attributes.
def _getpath(self):
    """Return the member name."""
    return self.name
def _setpath(self, name):
    """Set the member name."""
    self.name = name
path = property(fget=_getpath, fset=_setpath)

def _getlinkpath(self):
    """Return the link name."""
    return self.linkname
def _setlinkpath(self, linkname):
    """Set the link name."""
    self.linkname = linkname
linkpath = property(fget=_getlinkpath, fset=_setlinkpath)
965
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000966 def __repr__(self):
967 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
968
def get_info(self):
    """Return the TarInfo's attributes as a dictionary.
       The mode is reduced to its lower 12 bits and the name of a
       directory member always ends with a slash.
    """
    keys = ("name", "mode", "uid", "gid", "size", "mtime", "chksum",
            "type", "linkname", "uname", "gname", "devmajor", "devminor")
    info = {key: getattr(self, key) for key in keys}
    info["mode"] &= 0o7777

    is_dir = info["type"] == DIRTYPE
    if is_dir and not info["name"].endswith("/"):
        info["name"] += "/"

    return info
992
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
    """Return a tar header as a string of 512 byte blocks.
       format selects one of the *_FORMAT constants; any other value
       raises ValueError.
    """
    info = self.get_info()

    if format == USTAR_FORMAT:
        return self.create_ustar_header(info, encoding, errors)
    if format == GNU_FORMAT:
        return self.create_gnu_header(info, encoding, errors)
    if format == PAX_FORMAT:
        # The pax builder takes no error handler argument.
        return self.create_pax_header(info, encoding)

    raise ValueError("invalid format")
1006
def create_ustar_header(self, info, encoding, errors):
    """Return the object as a ustar header block.
       Raise ValueError if the linkname does not fit into the header
       or the name cannot be split into a prefix and a name part.
    """
    info["magic"] = POSIX_MAGIC

    if len(info["linkname"]) > LENGTH_LINK:
        raise ValueError("linkname is too long")

    name = info["name"]
    if len(name) > LENGTH_NAME:
        # Spread an overlong name over the prefix and name fields.
        prefix, name = self._posix_split_name(name)
        info["prefix"] = prefix
        info["name"] = name

    return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001019
def create_gnu_header(self, info, encoding, errors):
    """Return the object as a GNU header block sequence.
       An overlong linkname or name is stored in an extra long
       header that precedes the regular header block.
    """
    info["magic"] = GNU_MAGIC

    long_fields = (
        ("linkname", LENGTH_LINK, GNUTYPE_LONGLINK),
        ("name", LENGTH_NAME, GNUTYPE_LONGNAME),
    )

    blocks = []
    for key, limit, longtype in long_fields:
        if len(info[key]) > limit:
            blocks.append(self._create_gnu_long_header(info[key], longtype, encoding, errors))

    blocks.append(self._create_header(info, GNU_FORMAT, encoding, errors))
    return b"".join(blocks)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001033
def create_pax_header(self, info, encoding):
    """Return the object as a ustar header block. If it cannot be
       represented this way, prepend a pax extended header sequence
       with supplement information.
    """
    info["magic"] = POSIX_MAGIC
    pax_headers = self.pax_headers.copy()

    # String fields go into the pax header if they are not pure ASCII
    # or if they are longer than the ustar field.
    string_fields = (
        ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
        ("uname", "uname", 32), ("gname", "gname", 32))

    for name, hname, length in string_fields:
        if hname in pax_headers:
            # The pax header has priority.
            continue

        text = info[name]
        try:
            text.encode("ascii", "strict")
        except UnicodeEncodeError:
            oversized_or_non_ascii = True
        else:
            oversized_or_non_ascii = len(text) > length

        if oversized_or_non_ascii:
            pax_headers[hname] = text

    # Number fields go into the pax header if they exceed the field
    # limit or are floats.
    number_fields = (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12))

    for name, digits in number_fields:
        if name in pax_headers:
            # The pax header has priority. Avoid overflow.
            info[name] = 0
            continue

        val = info[name]
        if isinstance(val, float) or not 0 <= val < 8 ** (digits - 1):
            pax_headers[name] = str(val)
            info[name] = 0

    # Create a pax extended header if necessary.
    buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding) if pax_headers else b""

    return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001082
@classmethod
def create_pax_global_header(cls, pax_headers):
    """Return the object as a pax global header block sequence.
       The records are always written with the "utf-8" encoding.
    """
    header = cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
    return header
Guido van Rossumd8faa362007-04-27 19:54:29 +00001088
def _posix_split_name(self, name):
    """Split a name longer than 100 chars into a prefix
       and a name part. Raise ValueError if no usable split
       position exists.
    """
    # Split at the last slash among the first LENGTH_PREFIX + 1
    # characters; cut is 0 if there is none.
    cut = name.rfind("/", 0, LENGTH_PREFIX + 1) + 1

    # The slash itself belongs to neither part.
    prefix = name[:cut][:-1]
    name = name[cut:]

    if not prefix or len(name) > LENGTH_NAME:
        raise ValueError("name is too long")
    return prefix, name
1103
@staticmethod
def _create_header(info, format, encoding, errors):
    """Return a header block. info is a dictionary with file
       information, format must be one of the *_FORMAT constants.
       encoding and errors are passed on to stn() for the string
       fields. Missing keys in info fall back to neutral defaults.
    """
    parts = [
        stn(info.get("name", ""), 100, encoding, errors),
        itn(info.get("mode", 0) & 0o7777, 8, format),
        itn(info.get("uid", 0), 8, format),
        itn(info.get("gid", 0), 8, format),
        itn(info.get("size", 0), 12, format),
        itn(info.get("mtime", 0), 12, format),
        # Placeholder for the checksum field. It has to be exactly
        # 8 bytes wide: frombuf() reads the checksum from
        # buf[148:156] and the type from buf[156:157], and the splice
        # below assumes the field starts at offset 148.
        b" " * 8,
        info.get("type", REGTYPE),
        stn(info.get("linkname", ""), 100, encoding, errors),
        info.get("magic", POSIX_MAGIC),
        stn(info.get("uname", ""), 32, encoding, errors),
        stn(info.get("gname", ""), 32, encoding, errors),
        itn(info.get("devmajor", 0), 8, format),
        itn(info.get("devminor", 0), 8, format),
        stn(info.get("prefix", ""), 155, encoding, errors)
    ]

    # Pad the joined fields with NUL bytes up to a full block.
    buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
    chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
    # Overwrite the first 7 bytes of the placeholder (six octal digits
    # plus a NUL byte), the eighth byte stays a space.
    buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
    return buf
1131
@staticmethod
def _create_payload(payload):
    """Return the string payload filled with zero bytes
       up to the next 512 byte border.
    """
    # Bytes that exceed the last full block.
    overhang = len(payload) % BLOCKSIZE
    if overhang > 0:
        payload += NUL * (BLOCKSIZE - overhang)
    return payload
1141
@classmethod
def _create_gnu_long_header(cls, name, type, encoding, errors):
    """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
       for name.
    """
    # The payload is the encoded name followed by a NUL byte.
    name = name.encode(encoding, errors) + NUL

    info = {
        "name": "././@LongLink",
        "type": type,
        "size": len(name),
        "magic": GNU_MAGIC,
    }

    # create extended header + name blocks.
    header = cls._create_header(info, USTAR_FORMAT, encoding, errors)
    return header + cls._create_payload(name)
1158
@classmethod
def _create_pax_generic_header(cls, pax_headers, type, encoding):
    """Return a POSIX.1-2008 extended or global header sequence
       that contains a list of keyword, value pairs. The values
       must be strings.
    """
    # If a value cannot be encoded as strict UTF-8 (e.g. because it
    # contains surrogate characters) the whole header is written with
    # hdrcharset=BINARY, see _proc_pax() for more information.
    binary = False
    for value in pax_headers.values():
        try:
            value.encode("utf-8", "strict")
        except UnicodeEncodeError:
            binary = True
            break

    chunks = []
    if binary:
        # Put the hdrcharset field at the beginning of the header.
        chunks.append(b"21 hdrcharset=BINARY\n")

    for keyword, value in pax_headers.items():
        keyword = keyword.encode("utf-8")
        if binary:
            # Try to restore the original byte representation of `value'.
            # Needless to say, that the encoding must match the string.
            value = value.encode(encoding, "surrogateescape")
        else:
            value = value.encode("utf-8")

        # The length field counts the whole record including its own
        # digits, so iterate until the total no longer changes.
        base = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
        total = 0
        while True:
            candidate = base + len(str(total))
            if candidate == total:
                break
            total = candidate
        chunks.append(bytes(str(total), "ascii") + b" " + keyword + b"=" + value + b"\n")

    records = b"".join(chunks)

    # We use a hardcoded "././@PaxHeader" name like star does
    # instead of the one that POSIX recommends.
    info = {
        "name": "././@PaxHeader",
        "type": type,
        "size": len(records),
        "magic": POSIX_MAGIC,
    }

    # Create pax header + record blocks.
    header = cls._create_header(info, USTAR_FORMAT, "ascii", "replace")
    return header + cls._create_payload(records)
1209
@classmethod
def frombuf(cls, buf, encoding, errors):
    """Construct a TarInfo object from a 512 byte bytes object.
       encoding and errors are passed on to nts() for the string
       fields.

       Raises EmptyHeaderError for an empty buffer,
       TruncatedHeaderError if the buffer is not BLOCKSIZE bytes
       long, EOFHeaderError if it consists of NUL bytes only and
       InvalidHeaderError if the stored checksum is not among the
       values that calc_chksums() computes for the buffer.
    """
    if len(buf) == 0:
        raise EmptyHeaderError("empty header")
    if len(buf) != BLOCKSIZE:
        raise TruncatedHeaderError("truncated header")
    if buf.count(NUL) == BLOCKSIZE:
        raise EOFHeaderError("end of file header")

    chksum = nti(buf[148:156])
    if chksum not in calc_chksums(buf):
        raise InvalidHeaderError("bad checksum")

    # Read the fields from their fixed positions in the block.
    obj = cls()
    obj.name = nts(buf[0:100], encoding, errors)
    obj.mode = nti(buf[100:108])
    obj.uid = nti(buf[108:116])
    obj.gid = nti(buf[116:124])
    obj.size = nti(buf[124:136])
    obj.mtime = nti(buf[136:148])
    obj.chksum = chksum
    obj.type = buf[156:157]
    obj.linkname = nts(buf[157:257], encoding, errors)
    obj.uname = nts(buf[265:297], encoding, errors)
    obj.gname = nts(buf[297:329], encoding, errors)
    obj.devmajor = nti(buf[329:337])
    obj.devminor = nti(buf[337:345])
    prefix = nts(buf[345:500], encoding, errors)

    # Old V7 tar format represents a directory as a regular
    # file with a trailing slash.
    if obj.type == AREGTYPE and obj.name.endswith("/"):
        obj.type = DIRTYPE

    # The old GNU sparse format occupies some of the unused
    # space in the buffer for up to 4 sparse structures.
    # Save them for later processing in _proc_sparse().
    if obj.type == GNUTYPE_SPARSE:
        pos = 386
        structs = []
        for i in range(4):
            # Stop at the first structure that nti() rejects.
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            structs.append((offset, numbytes))
            pos += 24
        isextended = bool(buf[482])
        origsize = nti(buf[483:495])
        obj._sparse_structs = (structs, isextended, origsize)

    # Remove redundant slashes from directories.
    if obj.isdir():
        obj.name = obj.name.rstrip("/")

    # Reconstruct a ustar longname.
    if prefix and obj.type not in GNU_TYPES:
        obj.name = prefix + "/" + obj.name
    return obj
1272
@classmethod
def fromtarfile(cls, tarfile):
    """Return the next TarInfo object from TarFile object
       tarfile.
    """
    fileobj = tarfile.fileobj
    header = fileobj.read(BLOCKSIZE)
    member = cls.frombuf(header, tarfile.encoding, tarfile.errors)
    # The header block that was just read started one block before
    # the current position.
    member.offset = fileobj.tell() - BLOCKSIZE
    return member._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001282
Guido van Rossumd8faa362007-04-27 19:54:29 +00001283 #--------------------------------------------------------------------------
1284 # The following are methods that are called depending on the type of a
1285 # member. The entry point is _proc_member() which can be overridden in a
1286 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1287 # implement the following
1288 # operations:
1289 # 1. Set self.offset_data to the position where the data blocks begin,
1290 # if there is data that follows.
1291 # 2. Set tarfile.offset to the position where the next member's header will
1292 # begin.
1293 # 3. Return self or another valid TarInfo object.
def _proc_member(self, tarfile):
    """Choose the right processing method depending on
       the type and call it.
    """
    kind = self.type
    if kind in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
        handler = self._proc_gnulong
    elif kind == GNUTYPE_SPARSE:
        handler = self._proc_sparse
    elif kind in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
        handler = self._proc_pax
    else:
        handler = self._proc_builtin
    return handler(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001306
def _proc_builtin(self, tarfile):
    """Process a builtin type or an unknown type which
       will be treated as a regular file.
    """
    data_start = tarfile.fileobj.tell()
    self.offset_data = data_start

    next_header = data_start
    if self.isreg() or self.type not in SUPPORTED_TYPES:
        # Skip the following data blocks.
        next_header += self._block(self.size)
    tarfile.offset = next_header

    # Patch the TarInfo object with saved global
    # header information.
    self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

    return self
1323
def _proc_gnulong(self, tarfile):
    """Process the blocks that hold a GNU longname
       or longlink member.
    """
    payload = tarfile.fileobj.read(self._block(self.size))

    # Fetch the next header and process it.
    try:
        member = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Patch the TarInfo object from the next header with
    # the longname information.
    member.offset = self.offset
    kind = self.type
    if kind == GNUTYPE_LONGNAME:
        member.name = nts(payload, tarfile.encoding, tarfile.errors)
    elif kind == GNUTYPE_LONGLINK:
        member.linkname = nts(payload, tarfile.encoding, tarfile.errors)

    return member
1345
def _proc_sparse(self, tarfile):
    """Process a GNU sparse header plus extra headers.
    """
    # We already collected some sparse structures in frombuf().
    structs, isextended, origsize = self._sparse_structs
    del self._sparse_structs

    fileobj = tarfile.fileobj

    # Collect sparse structures from extended header blocks. Each
    # block holds up to 21 structures of 24 bytes.
    while isextended:
        block = fileobj.read(BLOCKSIZE)
        for pos in range(0, 21 * 24, 24):
            try:
                offset = nti(block[pos:pos + 12])
                numbytes = nti(block[pos + 12:pos + 24])
            except ValueError:
                break
            if offset and numbytes:
                structs.append((offset, numbytes))
        isextended = bool(block[504])
    self.sparse = structs

    self.offset_data = fileobj.tell()
    # The next header position is computed from the size stored in the
    # header; only then is self.size replaced by origsize.
    tarfile.offset = self.offset_data + self._block(self.size)
    self.size = origsize
    return self
1373
def _proc_pax(self, tarfile):
    """Process an extended or global header as described in
       POSIX.1-2008.

       Return the TarInfo object of the member that follows the pax
       header. Raise InvalidHeaderError if a record claims a length
       of zero and SubsequentHeaderError if the following header is
       missing or bad.
    """
    # Read the header information.
    buf = tarfile.fileobj.read(self._block(self.size))

    # A pax header stores supplemental information for either
    # the following file (extended) or all following files
    # (global).
    if self.type == XGLTYPE:
        pax_headers = tarfile.pax_headers
    else:
        pax_headers = tarfile.pax_headers.copy()

    # Check if the pax header contains a hdrcharset field. This tells us
    # the encoding of the path, linkpath, uname and gname fields. Normally,
    # these fields are UTF-8 encoded but since POSIX.1-2008 tar
    # implementations are allowed to store them as raw binary strings if
    # the translation to UTF-8 fails.
    match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
    if match is not None:
        pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

    # For the time being, we don't care about anything other than "BINARY".
    # The only other value that is currently allowed by the standard is
    # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
    hdrcharset = pax_headers.get("hdrcharset")
    if hdrcharset == "BINARY":
        encoding = tarfile.encoding
    else:
        encoding = "utf-8"

    # Parse pax header information. A record looks like that:
    # "%d %s=%s\n" % (length, keyword, value). length is the size
    # of the complete record including the length field itself and
    # the newline. keyword and value are both UTF-8 encoded strings.
    regex = re.compile(br"(\d+) ([^=]+)=")
    pos = 0
    while True:
        match = regex.match(buf, pos)
        if not match:
            break

        length, keyword = match.groups()
        length = int(length)
        if length == 0:
            # A zero length would never advance pos and the loop
            # would parse the same record forever.
            raise InvalidHeaderError("invalid header")
        value = buf[match.end(2) + 1:match.start(1) + length - 1]

        # Normally, we could just use "utf-8" as the encoding and "strict"
        # as the error handler, but we better not take the risk. For
        # example, GNU tar <= 1.23 is known to store filenames it cannot
        # translate to UTF-8 as raw strings (unfortunately without a
        # hdrcharset=BINARY header).
        # We first try the strict standard encoding, and if that fails we
        # fall back on the user's encoding and error handler.
        keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                tarfile.errors)
        if keyword in PAX_NAME_FIELDS:
            value = self._decode_pax_field(value, encoding, tarfile.encoding,
                    tarfile.errors)
        else:
            value = self._decode_pax_field(value, "utf-8", "utf-8",
                    tarfile.errors)

        pax_headers[keyword] = value
        pos += length

    # Fetch the next header.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Process GNU sparse information.
    if "GNU.sparse.map" in pax_headers:
        # GNU extended sparse format version 0.1.
        self._proc_gnusparse_01(next, pax_headers)

    elif "GNU.sparse.size" in pax_headers:
        # GNU extended sparse format version 0.0.
        self._proc_gnusparse_00(next, pax_headers, buf)

    elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
        # GNU extended sparse format version 1.0.
        self._proc_gnusparse_10(next, pax_headers, tarfile)

    if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
        # Patch the TarInfo object with the extended header info.
        next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
        next.offset = self.offset

        if "size" in pax_headers:
            # If the extended header replaces the size field,
            # we need to recalculate the offset where the next
            # header starts.
            offset = next.offset_data
            if next.isreg() or next.type not in SUPPORTED_TYPES:
                offset += next._block(next.size)
            tarfile.offset = offset

    return next
1475
def _proc_gnusparse_00(self, next, pax_headers, buf):
    """Process a GNU tar extended sparse header, version 0.0.

    The raw pax record buffer `buf' is scanned for every
    "GNU.sparse.offset" and "GNU.sparse.numbytes" record. The values
    are paired up in the order they occur and stored on `next.sparse'
    as a list of (offset, numbytes) tuples. `pax_headers' is not
    consulted here.
    """
    def collect(pattern):
        # All integer values captured by `pattern', in buffer order.
        return [int(found.group(1)) for found in re.finditer(pattern, buf)]

    starts = collect(br"\d+ GNU.sparse.offset=(\d+)\n")
    lengths = collect(br"\d+ GNU.sparse.numbytes=(\d+)\n")
    next.sparse = list(zip(starts, lengths))
1486
def _proc_gnusparse_01(self, next, pax_headers):
    """Process a GNU tar extended sparse header, version 0.1.

    The "GNU.sparse.map" pax header holds a comma separated list of
    integers that alternate between offset and size. They are stored
    on `next.sparse' as a list of (offset, size) tuples.
    """
    numbers = list(map(int, pax_headers["GNU.sparse.map"].split(",")))
    offsets = numbers[0::2]
    sizes = numbers[1::2]
    next.sparse = list(zip(offsets, sizes))
1492
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

    The sparse map is read from `tarfile.fileobj' in BLOCKSIZE chunks
    as newline terminated decimal numbers: first the number of map
    entries, then an offset and a size for every entry. Afterwards
    `next.offset_data' is set to the current position of the file
    object and `next.sparse' to the list of (offset, size) tuples.
    `pax_headers' is not consulted here.
    """
    stream = tarfile.fileobj
    pending = stream.read(BLOCKSIZE)
    head, pending = pending.split(b"\n", 1)
    wanted = int(head) * 2      # two numbers per map entry

    numbers = []
    while len(numbers) < wanted:
        if b"\n" not in pending:
            # No complete number buffered, fetch the next block.
            pending += stream.read(BLOCKSIZE)
        token, pending = pending.split(b"\n", 1)
        numbers.append(int(token))

    next.offset_data = stream.tell()
    next.sparse = list(zip(numbers[::2], numbers[1::2]))
1508
def _apply_pax_info(self, pax_headers, encoding, errors):
    """Replace fields with supplemental information from a previous
       pax extended or global header.

       "GNU.sparse.name" overrides `path', "GNU.sparse.size" and
       "GNU.sparse.realsize" override `size'. Every other keyword
       listed in PAX_FIELDS is copied to the attribute of the same
       name; numeric fields that cannot be converted become 0 and a
       `path' value loses its trailing slashes. Finally a copy of
       `pax_headers' is stored as self.pax_headers. `encoding' and
       `errors' are not used here.
    """
    for keyword, value in pax_headers.items():
        if keyword == "GNU.sparse.name":
            self.path = value
            continue
        if keyword in ("GNU.sparse.size", "GNU.sparse.realsize"):
            self.size = int(value)
            continue
        if keyword not in PAX_FIELDS:
            # Unknown keywords only end up in self.pax_headers.
            continue

        if keyword in PAX_NUMBER_FIELDS:
            try:
                value = PAX_NUMBER_FIELDS[keyword](value)
            except ValueError:
                value = 0
        if keyword == "path":
            value = value.rstrip("/")
        setattr(self, keyword, value)

    self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001531
def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
    """Decode a single field from a pax record.

       `value' is first decoded strictly with `encoding'. Only if that
       raises UnicodeDecodeError it is decoded with `fallback_encoding'
       and the `fallback_errors' error handler instead.
    """
    try:
        text = value.decode(encoding, "strict")
    except UnicodeDecodeError:
        text = value.decode(fallback_encoding, fallback_errors)
    return text
1539
def _block(self, count):
    """Round up a byte count by BLOCKSIZE and return it,
       e.g. _block(834) => 1024.
    """
    # Ceiling division: negate, floor-divide, negate again.
    return -(-count // BLOCKSIZE) * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001548
def isreg(self):
    """Return True if the member's type is one of REGULAR_TYPES."""
    return self.type in REGULAR_TYPES

def isfile(self):
    """Return the result of isreg()."""
    return self.isreg()

def isdir(self):
    """Return True if the member's type is DIRTYPE."""
    return self.type == DIRTYPE

def issym(self):
    """Return True if the member's type is SYMTYPE."""
    return self.type == SYMTYPE

def islnk(self):
    """Return True if the member's type is LNKTYPE."""
    return self.type == LNKTYPE

def ischr(self):
    """Return True if the member's type is CHRTYPE."""
    return self.type == CHRTYPE

def isblk(self):
    """Return True if the member's type is BLKTYPE."""
    return self.type == BLKTYPE

def isfifo(self):
    """Return True if the member's type is FIFOTYPE."""
    return self.type == FIFOTYPE

def issparse(self):
    """Return True if sparse information is set on the member."""
    return self.sparse is not None

def isdev(self):
    """Return True if the member's type is CHRTYPE, BLKTYPE or FIFOTYPE."""
    return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1569# class TarInfo
1570
1571class TarFile(object):
1572 """The TarFile Class provides an interface to tar archives.
1573 """
1574
# Class-level defaults. TarFile.__init__ overrides them per instance for
# every constructor argument that is not None.

debug = 0                   # Debug message level, from 0 (no msgs)
                            # to 3 (all msgs).

dereference = False         # If true, add the content of a linked file
                            # to the tar file, else the link itself.

ignore_zeros = False        # If true, skip empty or invalid blocks and
                            # continue processing.

errorlevel = 1              # If 0, fatal errors only appear in debug
                            # messages (if debug >= 0). If > 0, errors
                            # are passed to the caller as exceptions.

format = DEFAULT_FORMAT     # The format to use when creating an archive.

encoding = ENCODING         # Encoding for 8-bit character strings.

errors = None               # Error handler for unicode conversion.

tarinfo = TarInfo           # The default TarInfo class to use.

fileobject = ExFileObject   # The default ExFileObject class to use.
1596
def __init__(self, name=None, mode="r", fileobj=None, format=None,
        tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
        errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
    """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
       read from an existing archive, 'a' to append data to an existing
       file or 'w' to create a new file overwriting an existing one. `mode'
       defaults to 'r'.
       If `fileobj' is given, it is used for reading or writing data. If it
       can be determined, `mode' is overridden by `fileobj's mode.
       `fileobj' is not closed, when TarFile is closed.

       `format', `tarinfo', `dereference', `ignore_zeros', `encoding',
       `debug' and `errorlevel' replace the class-level defaults when they
       are not None. `errors' is always stored. `pax_headers' is only kept
       if the format in effect is PAX_FORMAT.

       Raises ValueError if `mode' is not exactly 'r', 'a' or 'w'. If
       reading the archive fails during initialization, a file object that
       was opened here is closed again before the exception propagates.
    """
    # Validate by exact membership. A substring test against "raw" would
    # let the empty string through and fail later with a KeyError.
    modes = {"r": "rb", "a": "r+b", "w": "wb"}
    if mode not in modes:
        raise ValueError("mode must be 'r', 'a' or 'w'")
    self.mode = mode
    self._mode = modes[mode]

    if not fileobj:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        self._extfileobj = False
    else:
        if name is None and hasattr(fileobj, "name"):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Init attributes.
    if format is not None:
        self.format = format
    if tarinfo is not None:
        self.tarinfo = tarinfo
    if dereference is not None:
        self.dereference = dereference
    if ignore_zeros is not None:
        self.ignore_zeros = ignore_zeros
    if encoding is not None:
        self.encoding = encoding
    self.errors = errors

    if pax_headers is not None and self.format == PAX_FORMAT:
        self.pax_headers = pax_headers
    else:
        self.pax_headers = {}

    if debug is not None:
        self.debug = debug
    if errorlevel is not None:
        self.errorlevel = errorlevel

    # Init datastructures.
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    tarinfo = self.tarinfo.fromtarfile(self)
                    self.members.append(tarinfo)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e))

        if self.mode in ("a", "w"):
            self._loaded = True

            if self.pax_headers:
                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    except:
        # Deliberately bare: clean up on any failure, then re-raise.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001692
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001693 #--------------------------------------------------------------------------
1694 # Below are the classmethods which act as alternate constructors to the
1695 # TarFile class. The open() method is the only one that is needed for
1696 # public use; it is the "super"-constructor and is able to select an
1697 # adequate "sub"-constructor for a particular compression using the mapping
1698 # from OPEN_METH.
1699 #
1700 # This concept allows one to subclass TarFile without losing the comfort of
1701 # the super-constructor. A sub-constructor is registered and made available
1702 # by adding it to the mapping in OPEN_METH.
1703
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
       an appropriate TarFile class.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing

       `bufsize' is passed to the stream object used for the '|' modes,
       all other keyword arguments are passed to the selected constructor.

       Raises ValueError if neither `name' nor `fileobj' is given or if
       `mode' cannot be interpreted, CompressionError for an unknown
       compression type and ReadError if no registered *open() method
       can read the file.
    """

    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Find out which *open() is appropriate for opening the file.
        for comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
            if fileobj is not None:
                saved_pos = fileobj.tell()
            try:
                return func(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Rewind so that the next method sees the same data.
                if fileobj is not None:
                    fileobj.seek(saved_pos)
                continue
        raise ReadError("file could not be opened successfully")

    elif ":" in mode:
        filemode, comptype = mode.split(":", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
        else:
            raise CompressionError("unknown compression type %r" % comptype)
        return func(name, filemode, fileobj, **kwargs)

    elif "|" in mode:
        filemode, comptype = mode.split("|", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Exact match, a substring test would also accept "rw".
        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            t = cls(name, filemode, stream, **kwargs)
        except:
            # Deliberately bare: release the stream on any failure.
            stream.close()
            raise
        t._extfileobj = False
        return t

    # Exact match, a substring test would also accept "" and "aw".
    elif mode in ("a", "w"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001779
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.

       Returns cls(name, mode, fileobj, **kwargs). Raises ValueError
       if `mode' is not exactly 'r', 'a' or 'w'.
    """
    # Exact match, a substring test against "raw" would accept "".
    if mode not in ("r", "a", "w"):
        raise ValueError("mode must be 'r', 'a' or 'w'")
    return cls(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001787
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError if `mode' is not exactly 'r' or 'w',
       CompressionError if the gzip module is not usable and ReadError
       if an IOError occurs after the gzip file object was created.
       A `fileobj' passed in by the caller is never closed here.
    """
    # Exact match, a substring test against "rw" would accept "".
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    extfileobj = fileobj is not None
    try:
        fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except IOError:
        if not extfileobj and fileobj is not None:
            fileobj.close()
        if fileobj is None:
            # GzipFile() itself failed, e.g. the file does not exist.
            raise
        raise ReadError("not a gzip file")
    except:
        # Deliberately bare: do not leak the file on any other failure.
        if not extfileobj and fileobj is not None:
            fileobj.close()
        raise
    t._extfileobj = extfileobj
    return t
1818
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
       Appending is not allowed.

       Raises ValueError if `mode' is not exactly 'r' or 'w',
       CompressionError if the bz2 module cannot be imported and
       ReadError if opening the archive fails with IOError or EOFError.
       The bzip2 file object created here is closed again whenever
       opening the archive fails.
    """
    # Exact match, a substring test against "rw" would accept "".
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    if fileobj is not None:
        fileobj = _BZ2Proxy(fileobj, mode)
    else:
        fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (IOError, EOFError):
        fileobj.close()
        raise ReadError("not a bzip2 file")
    except:
        # Deliberately bare: do not leak the file on any other failure,
        # matching what gzopen() does.
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1844
# Registry of the *open() methods: maps a compression type to the name
# of the classmethod that opens archives of that type.
OPEN_METH = {
    "tar": "taropen",
    "gz": "gzopen",
    "bz2": "bz2open",
}
1851
1852 #--------------------------------------------------------------------------
1853 # The public methods which TarFile provides:
1854
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive.

       Calling close() on an already closed TarFile does nothing. The
       underlying file object is closed unless it was supplied by the
       caller, and this also happens if writing the trailing blocks
       fails. The write error is still raised in that case.
    """
    if self.closed:
        return

    # Mark as closed first so that a failing write cannot leave the
    # object half-closed with the file still open.
    self.closed = True
    try:
        if self.mode in ("a", "w"):
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
    finally:
        if not self._extfileobj:
            self.fileobj.close()
1874
def getmember(self, name):
    """Return a TarInfo object for member `name'. If `name' can not be
       found in the archive, KeyError is raised. If a member occurs more
       than once in the archive, its last occurrence is assumed to be the
       most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001885
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects. The
       list has the same order as the members in the archive.
    """
    self._check()
    if self._loaded:
        return self.members
    # A complete list requires a scan of the whole archive first.
    self._load()
    return self.members
1895
def getnames(self):
    """Return the members of the archive as a list of their names. It has
       the same order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001901
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object for either the file `name' or the file
       object `fileobj' (using os.fstat on its file descriptor). You can
       modify some of the TarInfo's attributes before you add it using
       addfile(). If given, `arcname' specifies an alternative name for the
       file in the archive.

       Returns None if the file is of a type that is not handled here.
    """
    self._check("aw")

    # A file object's real name takes precedence over `name'.
    if fileobj is not None:
        name = fileobj.name

    # Build the member name: no drive, forward slashes only and
    # no leading slash, so that the path is relative.
    if arcname is None:
        arcname = name
    arcname = os.path.splitdrive(arcname)[1]
    arcname = arcname.replace(os.sep, "/").lstrip("/")

    # Now, fill the TarInfo object with
    # information specific for the file.
    tarinfo = self.tarinfo()
    tarinfo.tarfile = self

    # Use os.fstat, os.stat or os.lstat, depending on the input,
    # the platform and if symlinks shall be resolved.
    if fileobj is not None:
        statres = os.fstat(fileobj.fileno())
    elif self.dereference or not hasattr(os, "lstat"):
        statres = os.stat(name)
    else:
        statres = os.lstat(name)
    linkname = ""

    stmd = statres.st_mode
    if stat.S_ISREG(stmd):
        inode = (statres.st_ino, statres.st_dev)
        is_hardlink = (not self.dereference and statres.st_nlink > 1 and
                       inode in self.inodes and arcname != self.inodes[inode])
        if is_hardlink:
            # A hardlink to an already archived file.
            kind = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # The inode is added only if its valid.
            # For win32 it is always 0.
            kind = REGTYPE
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(stmd):
        kind = DIRTYPE
    elif stat.S_ISFIFO(stmd):
        kind = FIFOTYPE
    elif stat.S_ISLNK(stmd):
        kind = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(stmd):
        kind = CHRTYPE
    elif stat.S_ISBLK(stmd):
        kind = BLKTYPE
    else:
        return None

    # Fill the TarInfo object with all
    # information we can get.
    tarinfo.name = arcname
    tarinfo.mode = stmd
    tarinfo.uid = statres.st_uid
    tarinfo.gid = statres.st_gid
    # Only regular files carry data in the archive.
    tarinfo.size = statres.st_size if kind == REGTYPE else 0
    tarinfo.mtime = statres.st_mtime
    tarinfo.type = kind
    tarinfo.linkname = linkname
    if pwd:
        try:
            tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
        except KeyError:
            pass

    if kind in (CHRTYPE, BLKTYPE):
        if hasattr(os, "major") and hasattr(os, "minor"):
            tarinfo.devmajor = os.major(statres.st_rdev)
            tarinfo.devminor = os.minor(statres.st_rdev)
    return tarinfo
1999
def list(self, verbose=True):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced.
    """
    self._check()

    def column(*values):
        # Print one column, followed by a space instead of a newline.
        print(*values, end=' ')

    for tarinfo in self:
        if verbose:
            column(filemode(tarinfo.mode))
            owner = tarinfo.uname or tarinfo.uid
            group = tarinfo.gname or tarinfo.gid
            column("%s/%s" % (owner, group))
            if tarinfo.ischr() or tarinfo.isblk():
                # Devices show "major,minor" in place of a size.
                device = "%d,%d" % (tarinfo.devmajor, tarinfo.devminor)
                column("%10s" % device)
            else:
                column("%10d" % tarinfo.size)
            stamp = time.localtime(tarinfo.mtime)[:6]
            column("%d-%02d-%02d %02d:%02d:%02d" % stamp)

        suffix = "/" if tarinfo.isdir() else ""
        column(tarinfo.name + suffix)

        if verbose:
            if tarinfo.issym():
                column("->", tarinfo.linkname)
            if tarinfo.islnk():
                column("link to", tarinfo.linkname)
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002028
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. `exclude' is a function that should
       return True for each filename to be excluded. `filter' is a function
       that expects a TarInfo object argument and returns the changed
       TarInfo object, if it returns None the TarInfo object will be
       excluded from the archive.

       Passing `exclude' emits a DeprecationWarning. Nothing is added if
       `name' is the archive itself or if gettarinfo() returns None for
       it.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)

    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        # The with statement closes the file even if addfile() raises.
        with bltn_open(name, "rb") as f:
            self.addfile(tarinfo, f)

    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            for f in os.listdir(name):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                        recursive, exclude, filter=filter)

    else:
        self.addfile(tarinfo)
2090
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, tarinfo.size bytes are read from it and added to the archive.
       You can create TarInfo objects using gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.
    """
    self._check("aw")

    # Work on a copy, the caller's object is not stored in the archive.
    member = copy.copy(tarinfo)
    out = self.fileobj

    header = member.tobuf(self.format, self.encoding, self.errors)
    out.write(header)
    self.offset += len(header)

    # If there's data to follow, append it.
    if fileobj is not None:
        copyfileobj(fileobj, out, member.size)
        full, tail = divmod(member.size, BLOCKSIZE)
        if tail > 0:
            # Pad the last block with NULs up to BLOCKSIZE.
            out.write(NUL * (BLOCKSIZE - tail))
            full += 1
        self.offset += full * BLOCKSIZE

    self.members.append(member)
Martin v. Löwisf3c56112004-09-18 09:08:52 +00002115 self.members.append(tarinfo)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002116
def extractall(self, path=".", members=None):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers().
    """
    deferred = []

    for member in (self if members is None else members):
        target = member
        if member.isdir():
            # Extract directories with a safe mode and remember the
            # original so its attributes can be set further down.
            deferred.append(member)
            target = copy.copy(member)
            target.mode = 0o700
        # Do not set_attrs directories, as we will do that further down
        self.extract(target, path, set_attrs=not target.isdir())

    # Set correct owner, mtime and filemode on directories,
    # in reverse sorted order of their names.
    for member in reversed(sorted(deferred, key=lambda a: a.name)):
        dirpath = os.path.join(path, member.name)
        try:
            self.chown(member, dirpath)
            self.utime(member, dirpath)
            self.chmod(member, dirpath)
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % e)
2154
def extract(self, member, path="", set_attrs=True):
    """Extract a single member below `path' (the current working
       directory by default), using the member's full name.

       `member' may be a filename or a TarInfo object. Owner, mtime and
       mode are applied unless `set_attrs' is False. Whether an error is
       raised or only reported through _dbg() depends on self.errorlevel.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    if tarinfo.islnk():
        # makelink() reads this attribute to locate the link target.
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    destination = os.path.join(path, tarinfo.name)
    try:
        self._extract_member(tarinfo, destination, set_attrs=set_attrs)
    except EnvironmentError as e:
        if self.errorlevel > 0:
            raise
        if e.filename is None:
            self._dbg(1, "tarfile: %s" % e.strerror)
        else:
            self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
    except ExtractError as e:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % e)
2189
def extractfile(self, member):
    """Return a read-only file-like object for `member', which may be a
       filename or a TarInfo object.

       Regular files and members of unknown type yield a file object
       directly. For a link, the file object of the link's target is
       returned. Members without data (directories, devices, ...) yield
       None.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Members of unknown type are treated like regular files.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    if tarinfo.islnk() or tarinfo.issym():
        if isinstance(self.fileobj, _Stream):
            # Resolving a (sym)link on a non-seekable stream of tar
            # blocks is not possible.
            raise StreamError("cannot extract (sym)link as file object")
        # A (sym)link's file object is its target's file object.
        return self.extractfile(self._find_link_target(tarinfo))

    # Nothing to read for directories, chrdev, blkdev and the like.
    return None
2227
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002228 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002229 """Extract the TarInfo object tarinfo to a physical
2230 file called targetpath.
2231 """
2232 # Fetch the TarInfo object for the given name
2233 # and build the destination pathname, replacing
2234 # forward slashes to platform specific separators.
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +00002235 targetpath = targetpath.rstrip("/")
2236 targetpath = targetpath.replace("/", os.sep)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002237
2238 # Create all upper directories.
2239 upperdirs = os.path.dirname(targetpath)
2240 if upperdirs and not os.path.exists(upperdirs):
Christian Heimes2202f872008-02-06 14:31:34 +00002241 # Create directories that are not part of the archive with
2242 # default permissions.
Thomas Woutersb2137042007-02-01 18:02:27 +00002243 os.makedirs(upperdirs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002244
2245 if tarinfo.islnk() or tarinfo.issym():
2246 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2247 else:
2248 self._dbg(1, tarinfo.name)
2249
2250 if tarinfo.isreg():
2251 self.makefile(tarinfo, targetpath)
2252 elif tarinfo.isdir():
2253 self.makedir(tarinfo, targetpath)
2254 elif tarinfo.isfifo():
2255 self.makefifo(tarinfo, targetpath)
2256 elif tarinfo.ischr() or tarinfo.isblk():
2257 self.makedev(tarinfo, targetpath)
2258 elif tarinfo.islnk() or tarinfo.issym():
2259 self.makelink(tarinfo, targetpath)
2260 elif tarinfo.type not in SUPPORTED_TYPES:
2261 self.makeunknown(tarinfo, targetpath)
2262 else:
2263 self.makefile(tarinfo, targetpath)
2264
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002265 if set_attrs:
2266 self.chown(tarinfo, targetpath)
2267 if not tarinfo.issym():
2268 self.chmod(tarinfo, targetpath)
2269 self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002270
2271 #--------------------------------------------------------------------------
2272 # Below are the different file methods. They are called via
2273 # _extract_member() when extract() is called. They can be replaced in a
2274 # subclass to implement other functionality.
2275
def makedir(self, tarinfo, targetpath):
    """Create the directory `targetpath'; an already existing one is
       left untouched.
    """
    # A safe mode is used here; the real mode is applied later on,
    # see _extract_member().
    safe_mode = 0o700
    try:
        os.mkdir(targetpath, safe_mode)
    except FileExistsError:
        pass
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002285
def makefile(self, tarinfo, targetpath):
    """Write the data of `tarinfo' to a new file called targetpath.

       The data is read from self.fileobj starting at
       tarinfo.offset_data. If tarinfo.sparse is set, it is iterated as
       (offset, size) pairs and each chunk is written at its offset;
       otherwise tarinfo.size bytes are copied in one run. The file is
       finally cut or extended to exactly tarinfo.size bytes.
    """
    source = self.fileobj
    source.seek(tarinfo.offset_data)
    # The with-statement closes the target even if copying fails
    # half-way, so no file handle is leaked on errors.
    with bltn_open(targetpath, "wb") as target:
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
        else:
            copyfileobj(source, target, tarinfo.size)
        # Make sure the file has its full size, even when it ends
        # with a hole.
        target.seek(tarinfo.size)
        target.truncate()
2301
def makeunknown(self, tarinfo, targetpath):
    """Extract a member whose type is unknown: its data is written to
       targetpath like a regular file and a debug note is emitted.
    """
    self.makefile(tarinfo, targetpath)
    note = ("tarfile: Unknown file type %r, "
            "extracted as regular file." % tarinfo.type)
    self._dbg(1, note)
2309
def makefifo(self, tarinfo, targetpath):
    """Create a fifo called targetpath. ExtractError is raised if the
       os module offers no mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002317
def makedev(self, tarinfo, targetpath):
    """Create a character or block device node called targetpath.
       ExtractError is raised if the os module lacks mknod() or
       makedev().
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the permission bits with the matching device type bit.
    mode = tarinfo.mode | (stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR)
    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, mode, device)
2332
def makelink(self, tarinfo, targetpath):
    """Make a (symbolic) link called targetpath. If it cannot be created
       (platform limitation), we try to make a copy of the referenced
       file instead of a link.

       Raises ExtractError if the fallback copy is needed but the link
       target cannot be found inside the archive.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        else:
            # tarinfo._link_target is prepared by extract().
            if os.path.exists(tarinfo._link_target):
                os.link(tarinfo._link_target, targetpath)
            else:
                # The hard link's target is not on disk; extract the
                # referenced member under the link's name instead.
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
    except symlink_exception:
        # Links are not available here: fall back to extracting a copy
        # of the referenced member. This must happen only on failure;
        # doing it after a successful link would write the target's
        # data over the link that was just created.
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002361
def chown(self, tarinfo, targetpath):
    """Set the owner of targetpath according to tarinfo. Nothing is
       done unless the process runs with an effective uid of 0.
       ExtractError is raised if the ownership change fails.
    """
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        # We have to be root to change the owner.
        return

    # Prefer the symbolic names; fall back to the numeric ids when a
    # name is unknown on this system.
    try:
        gid = grp.getgrnam(tarinfo.gname)[2]
    except KeyError:
        gid = tarinfo.gid
    try:
        uid = pwd.getpwnam(tarinfo.uname)[2]
    except KeyError:
        uid = tarinfo.uid

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, uid, gid)
        elif sys.platform != "os2emx":
            os.chown(targetpath, uid, gid)
    except EnvironmentError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002383
def chmod(self, tarinfo, targetpath):
    """Apply tarinfo.mode to targetpath. Nothing is done if the os
       module has no chmod(); ExtractError is raised if the call fails.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except EnvironmentError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002392
def utime(self, tarinfo, targetpath):
    """Set both access and modification time of targetpath to
       tarinfo.mtime. Nothing is done if the os module has no utime();
       ExtractError is raised if the call fails.
    """
    if hasattr(os, 'utime'):
        stamp = (tarinfo.mtime, tarinfo.mtime)
        try:
            os.utime(targetpath, stamp)
        except EnvironmentError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002402
2403 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, or
       None if there is no further member. The TarFile must be open
       for reading or appending.
    """
    self._check("ra")

    # A member that was read ahead is handed out first.
    pending = self.firstmember
    if pending is not None:
        self.firstmember = None
        return pending

    # Read the next header block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as exc:
            if self.ignore_zeros:
                # Skip the block and try again with the following one.
                self._dbg(2, "0x%X: %s" % (self.offset, exc))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as exc:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, exc))
                self.offset += BLOCKSIZE
                continue
            elif self.offset == 0:
                raise ReadError(str(exc))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as exc:
            if self.offset == 0:
                raise ReadError(str(exc))
        except SubsequentHeaderError as exc:
            raise ReadError(str(exc))
        # Either a header was read or the error ends the scan.
        break

    if tarinfo is None:
        # No member could be read: the member list is complete.
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002449
2450 #--------------------------------------------------------------------------
2451 # Little helper methods:
2452
Lars Gustäbel1b512722010-06-03 12:45:16 +00002453 def _getmember(self, name, tarinfo=None, normalize=False):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002454 """Find an archive member by name from bottom to top.
2455 If tarinfo is given, it is used as the starting point.
2456 """
Martin v. Löwisf3c56112004-09-18 09:08:52 +00002457 # Ensure that all members have been loaded.
2458 members = self.getmembers()
2459
Lars Gustäbel1b512722010-06-03 12:45:16 +00002460 # Limit the member search list up to tarinfo.
2461 if tarinfo is not None:
2462 members = members[:members.index(tarinfo)]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002463
Lars Gustäbel1b512722010-06-03 12:45:16 +00002464 if normalize:
2465 name = os.path.normpath(name)
2466
2467 for member in reversed(members):
2468 if normalize:
2469 member_name = os.path.normpath(member.name)
2470 else:
2471 member_name = member.name
2472
2473 if name == member_name:
2474 return member
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002475
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002476 def _load(self):
2477 """Read through the entire archive file and look for readable
2478 members.
2479 """
2480 while True:
2481 tarinfo = self.next()
2482 if tarinfo is None:
2483 break
2484 self._loaded = True
2485
2486 def _check(self, mode=None):
2487 """Check if TarFile is still open, and if the operation's mode
2488 corresponds to TarFile's mode.
2489 """
2490 if self.closed:
Thomas Wouters477c8d52006-05-27 19:21:47 +00002491 raise IOError("%s is closed" % self.__class__.__name__)
Guido van Rossumd8faa362007-04-27 19:54:29 +00002492 if mode is not None and self.mode not in mode:
2493 raise IOError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002494
Lars Gustäbel1b512722010-06-03 12:45:16 +00002495 def _find_link_target(self, tarinfo):
2496 """Find the target member of a symlink or hardlink member in the
2497 archive.
2498 """
2499 if tarinfo.issym():
2500 # Always search the entire archive.
2501 linkname = os.path.dirname(tarinfo.name) + "/" + tarinfo.linkname
2502 limit = None
2503 else:
2504 # Search the archive before the link, because a hard link is
2505 # just a reference to an already archived file.
2506 linkname = tarinfo.linkname
2507 limit = tarinfo
2508
2509 member = self._getmember(linkname, tarinfo=limit, normalize=True)
2510 if member is None:
2511 raise KeyError("linkname %r not found" % linkname)
2512 return member
2513
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002514 def __iter__(self):
2515 """Provide an iterator object.
2516 """
2517 if self._loaded:
2518 return iter(self.members)
2519 else:
2520 return TarIter(self)
2521
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002522 def _dbg(self, level, msg):
2523 """Write debugging output to sys.stderr.
2524 """
2525 if level <= self.debug:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00002526 print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002527
2528 def __enter__(self):
2529 self._check()
2530 return self
2531
2532 def __exit__(self, type, value, traceback):
2533 if type is None:
2534 self.close()
2535 else:
2536 # An exception occurred. We must not call close() because
2537 # it would try to write end-of-archive blocks and padding.
2538 if not self._extfileobj:
2539 self.fileobj.close()
2540 self.closed = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002541# class TarFile
2542
class TarIter:
    """Iterator over the members of a TarFile.

       for tarinfo in TarFile(...):
           suite...

       Every TarIter keeps its own position, so several iterators over
       the same TarFile each see all members from the beginning.
    """

    def __init__(self, tarfile):
        """Construct a TarIter object for `tarfile'.
        """
        self.tarfile = tarfile
        # Position of the next member within tarfile.members.
        self.index = 0

    def __iter__(self):
        """Return the iterator object itself.
        """
        return self

    def __next__(self):
        """Return the next member. Members the TarFile has already read
           are served from its members list; further ones are fetched
           with TarFile's next() method. When all members have been
           read, the TarFile is marked as _loaded.
        """
        tarfile = self.tarfile
        if self.index == 0 and tarfile.firstmember is not None:
            # Let next() hand out (and clear) the read-ahead member.
            tarinfo = tarfile.next()
        elif self.index < len(tarfile.members):
            # Already read, e.g. by getmembers() or by another iterator:
            # serve it from the list instead of skipping it.
            tarinfo = tarfile.members[self.index]
        elif not tarfile._loaded:
            tarinfo = tarfile.next()
            if not tarinfo:
                tarfile._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return tarinfo
2578
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002579#--------------------
2580# exported functions
2581#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.

       The archive is opened with this module's open() and closed again
       right away; a TarError during that attempt yields False.
    """
    try:
        open(name).close()
    except TarError:
        return False
    return True
2592
# Keep a reference to the builtin open() before the module-level name is
# rebound below; makefile() uses bltn_open to create the extracted files.
bltn_open = open
# From here on, `open' in this module refers to TarFile.open.
open = TarFile.open