blob: 39fe635049c0bd7043a97ea63bb0dfd94867f29a [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
Guido van Rossumd8faa362007-04-27 19:54:29 +000032version = "0.9.0"
Guido van Rossum98297ee2007-11-06 21:34:58 +000033__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
Senthil Kumaran7c9719c2011-07-28 22:32:49 +080034__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
35__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
Guido van Rossum98297ee2007-11-06 21:34:58 +000036__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000037
38#---------
39# Imports
40#---------
41import sys
42import os
43import shutil
44import stat
45import errno
46import time
47import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000048import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000049import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000050
51try:
52 import grp, pwd
53except ImportError:
54 grp = pwd = None
55
# Exceptions that signal "symbolic links cannot be created here".
# os.symlink on Windows prior to 6.0 raises NotImplementedError; on
# platforms without os.symlink the attribute lookup itself fails.
try:
    # WindowsError (1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege.  The name only exists on
    # Windows builds, hence the NameError guard.
    symlink_exception = (AttributeError, NotImplementedError, WindowsError)
except NameError:
    symlink_exception = (AttributeError, NotImplementedError)
64
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000065# from tarfile import *
66__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
67
Georg Brandl1a3284e2007-12-02 09:40:06 +000068from builtins import open as _open # Since 'open' is TarFile.open
Guido van Rossum8f78fe92006-08-24 04:03:53 +000069
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000070#---------------------------------------------------------
71# tar constants
72#---------------------------------------------------------
# Basic building blocks of a tar stream.
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records

# The magic and version fields together occupy 8 bytes of the header.
# GNU tar writes "ustar" followed by TWO spaces and a NUL; POSIX writes
# "ustar", a NUL and the version "00".
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

# Maximum field lengths.
LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

# Values of the typeflag header field.
REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

# Archive formats.
USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT

#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}
141
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000142#---------------------------------------------------------
143# Bits used in the mode field, values in octal.
144#---------------------------------------------------------
# File type bits.
S_IFLNK = 0o120000          # symbolic link
S_IFREG = 0o100000          # regular file
S_IFBLK = 0o060000          # block device
S_IFDIR = 0o040000          # directory
S_IFCHR = 0o020000          # character device
S_IFIFO = 0o010000          # fifo

# Special permission bits.
TSUID = 0o4000              # set UID on execution
TSGID = 0o2000              # set GID on execution
TSVTX = 0o1000              # reserved

# Owner permission bits.
TUREAD = 0o400              # read by owner
TUWRITE = 0o200             # write by owner
TUEXEC = 0o100              # execute/search by owner

# Group permission bits.
TGREAD = 0o040              # read by group
TGWRITE = 0o020             # write by group
TGEXEC = 0o010              # execute/search by group

# Permission bits for everybody else.
TOREAD = 0o004              # read by other
TOWRITE = 0o002             # write by other
TOEXEC = 0o001              # execute/search by other
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000165
166#---------------------------------------------------------
Guido van Rossumd8faa362007-04-27 19:54:29 +0000167# initialization
168#---------------------------------------------------------
# Default encoding for member names: UTF-8 on Windows ("nt"/"ce"),
# the filesystem encoding everywhere else.
ENCODING = ("utf-8" if os.name in ("nt", "ce")
            else sys.getfilesystemencoding())
Guido van Rossumd8faa362007-04-27 19:54:29 +0000173
174#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000175# Some useful functions
176#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000177
def stn(s, length, encoding, errors):
    """Encode the string s and fit it into a field of length bytes.

    The encoded value is cut off at length bytes; shorter values are
    padded on the right with null bytes.
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, b"\0")
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
def nts(s, encoding, errors):
    """Decode a bytes field up to (not including) its first null byte.

    If s contains no null byte the whole value is decoded.
    """
    head, _, _ = s.partition(b"\0")
    return head.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000191
def nti(s):
    """Convert a number field to a python number.

    Raises InvalidHeaderError if the field is neither base-256 encoded
    nor a valid octal string.
    """
    # There are two possible encodings for a number field, see
    # itn() below.
    marker = s[0]
    if marker not in (0o200, 0o377):
        # Plain POSIX encoding: octal digits, optionally null-terminated.
        try:
            return int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")

    # GNU base-256 encoding: the bytes after the marker are big-endian.
    value = 0
    for byte in s[1:]:
        value = (value << 8) + byte
    if marker == 0o377:
        # A 0o377 marker flags a negative number.
        value = -(256 ** (len(s) - 1) - value)
    return value
210
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.

    n is truncated to an integer first.  Returns a field of exactly
    digits bytes: a bytes object for the octal encoding, a bytearray
    for the GNU base-256 encoding.  Raises ValueError if n does not
    fit into the field in the given format.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicates this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.

    # Header fields hold whole numbers only.  Float values are dropped
    # to their integer part here because the "%o" conversion below
    # rejects floats on recent Python 3 releases.
    n = int(n)

    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            # Two's complement so the low bytes can be masked off below.
            n = 256 ** digits + n

        # Emit the digits-1 low-order bytes, most significant first.
        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s
238
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a member's header.

    All header bytes are summed up, except for the 8-byte chksum field
    (offsets 148-155) which counts as if it was filled with spaces.
    According to the GNU tar sources, some tars (Sun and NeXT) calculate
    chksum with signed char, which gives a different result if there
    are bytes in the buffer with the high bit set, so both variants
    are computed.
    """
    # Everything in the 512-byte header except the chksum field itself.
    summed = buf[:148] + buf[156:512]
    # 256 == 8 * ord(" "), the contribution of the blanked chksum field.
    unsigned_chksum = 256 + sum(struct.unpack("504B", summed))
    signed_chksum = 256 + sum(struct.unpack("504b", summed))
    return unsigned_chksum, signed_chksum
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000251
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.

    If length is None, the entire content is copied.  Raises IOError
    if src runs out of data before length bytes were copied.
    """
    if length is None:
        shutil.copyfileobj(src, dst)
        return
    if length == 0:
        return

    def transfer(count):
        # Move exactly count bytes; a short read means src is truncated.
        data = src.read(count)
        if len(data) < count:
            raise IOError("end of file reached")
        dst.write(data)

    chunk_size = 16 * 1024
    full_chunks, tail = divmod(length, chunk_size)
    for _ in range(full_chunks):
        transfer(chunk_size)
    if tail != 0:
        transfer(tail)
    return
276
# One inner tuple per character of the "-rwxrwxrwx" string.  Within an
# inner tuple the (bits, char) pairs are tried in order and the first
# pair whose bits are all set in the mode wins.
filemode_table = (
    # file type
    ((S_IFLNK, "l"), (S_IFREG, "-"), (S_IFBLK, "b"),
     (S_IFDIR, "d"), (S_IFCHR, "c"), (S_IFIFO, "p")),

    # owner
    ((TUREAD, "r"),),
    ((TUWRITE, "w"),),
    ((TUEXEC|TSUID, "s"), (TSUID, "S"), (TUEXEC, "x")),

    # group
    ((TGREAD, "r"),),
    ((TGWRITE, "w"),),
    ((TGEXEC|TSGID, "s"), (TSGID, "S"), (TGEXEC, "x")),

    # other
    ((TOREAD, "r"),),
    ((TOWRITE, "w"),),
    ((TOEXEC|TSVTX, "t"), (TSVTX, "T"), (TOEXEC, "x")),
)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000303
def filemode(mode):
    """Convert a file's mode to a string of the form
       -rwxrwxrwx.
       Used by TarFile.list()
    """
    # For every output position take the character of the first entry
    # whose bits are all set in mode, or "-" if no entry matches.
    return "".join(
        next((char for bit, char in table if mode & bit == bit), "-")
        for table in filemode_table
    )
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000318
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000352
353#---------------------------
354# internal stream interface
355#---------------------------
356class _LowLevelFile:
357 """Low-level file object. Supports reading and writing.
358 It is used instead of a regular file object for streaming
359 access.
360 """
361
362 def __init__(self, name, mode):
363 mode = {
364 "r": os.O_RDONLY,
365 "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
366 }[mode]
367 if hasattr(os, "O_BINARY"):
368 mode |= os.O_BINARY
Lars Gustäbeld6eb70b2010-04-29 15:37:02 +0000369 self.fd = os.open(name, mode, 0o666)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000370
371 def close(self):
372 os.close(self.fd)
373
374 def read(self, size):
375 return os.read(self.fd, size)
376
377 def write(self, s):
378 os.write(self.fd, s)
379
380class _Stream:
381 """Class that serves as an adapter between TarFile and
382 a stream-like object. The stream-like object only
383 needs to have a read() or write() method and is accessed
384 blockwise. Use of gzip or bzip2 compression is possible.
385 A stream-like object could be for example: sys.stdin,
386 sys.stdout, a socket, a tape device etc.
387
388 _Stream is intended to be used only internally.
389 """
390
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000391 def __init__(self, name, mode, comptype, fileobj, bufsize):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000392 """Construct a _Stream object.
393 """
394 self._extfileobj = True
395 if fileobj is None:
396 fileobj = _LowLevelFile(name, mode)
397 self._extfileobj = False
398
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000399 if comptype == '*':
400 # Enable transparent compression detection for the
401 # stream interface
402 fileobj = _StreamProxy(fileobj)
403 comptype = fileobj.getcomptype()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000404
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000405 self.name = name or ""
406 self.mode = mode
407 self.comptype = comptype
408 self.fileobj = fileobj
409 self.bufsize = bufsize
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000410 self.buf = b""
Guido van Rossume2a383d2007-01-15 16:59:06 +0000411 self.pos = 0
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000412 self.closed = False
413
Antoine Pitrou605c2932010-09-23 20:15:14 +0000414 try:
415 if comptype == "gz":
416 try:
417 import zlib
418 except ImportError:
419 raise CompressionError("zlib module is not available")
420 self.zlib = zlib
421 self.crc = zlib.crc32(b"")
422 if mode == "r":
423 self._init_read_gz()
424 else:
425 self._init_write_gz()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000426
Antoine Pitrou605c2932010-09-23 20:15:14 +0000427 if comptype == "bz2":
428 try:
429 import bz2
430 except ImportError:
431 raise CompressionError("bz2 module is not available")
432 if mode == "r":
433 self.dbuf = b""
434 self.cmp = bz2.BZ2Decompressor()
435 else:
436 self.cmp = bz2.BZ2Compressor()
437 except:
438 if not self._extfileobj:
439 self.fileobj.close()
440 self.closed = True
441 raise
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000442
443 def __del__(self):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000444 if hasattr(self, "closed") and not self.closed:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000445 self.close()
446
447 def _init_write_gz(self):
448 """Initialize for writing with gzip compression.
449 """
450 self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
451 -self.zlib.MAX_WBITS,
452 self.zlib.DEF_MEM_LEVEL,
453 0)
Guido van Rossume2a383d2007-01-15 16:59:06 +0000454 timestamp = struct.pack("<L", int(time.time()))
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000455 self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000456 if self.name.endswith(".gz"):
457 self.name = self.name[:-3]
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000458 # RFC1952 says we must use ISO-8859-1 for the FNAME field.
459 self.__write(self.name.encode("iso-8859-1", "replace") + NUL)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000460
461 def write(self, s):
462 """Write string s to the stream.
463 """
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000464 if self.comptype == "gz":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000465 self.crc = self.zlib.crc32(s, self.crc)
466 self.pos += len(s)
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000467 if self.comptype != "tar":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000468 s = self.cmp.compress(s)
469 self.__write(s)
470
471 def __write(self, s):
472 """Write string s to the stream if a whole new block
473 is ready to be written.
474 """
475 self.buf += s
476 while len(self.buf) > self.bufsize:
477 self.fileobj.write(self.buf[:self.bufsize])
478 self.buf = self.buf[self.bufsize:]
479
480 def close(self):
481 """Close the _Stream object. No operation should be
482 done on it afterwards.
483 """
484 if self.closed:
485 return
486
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000487 if self.mode == "w" and self.comptype != "tar":
Martin v. Löwisc234a522004-08-22 21:28:33 +0000488 self.buf += self.cmp.flush()
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000489
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000490 if self.mode == "w" and self.buf:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000491 self.fileobj.write(self.buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000492 self.buf = b""
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000493 if self.comptype == "gz":
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000494 # The native zlib crc is an unsigned 32-bit integer, but
495 # the Python wrapper implicitly casts that to a signed C
496 # long. So, on a 32-bit box self.crc may "look negative",
497 # while the same crc on a 64-bit box may "look positive".
498 # To avoid irksome warnings from the `struct` module, force
499 # it to look positive on all boxes.
Guido van Rossume2a383d2007-01-15 16:59:06 +0000500 self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
501 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000502
503 if not self._extfileobj:
504 self.fileobj.close()
505
506 self.closed = True
507
508 def _init_read_gz(self):
509 """Initialize for reading a gzip compressed fileobj.
510 """
511 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000512 self.dbuf = b""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000513
514 # taken from gzip.GzipFile with some alterations
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000515 if self.__read(2) != b"\037\213":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000516 raise ReadError("not a gzip file")
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000517 if self.__read(1) != b"\010":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000518 raise CompressionError("unsupported compression method")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000519
520 flag = ord(self.__read(1))
521 self.__read(6)
522
523 if flag & 4:
524 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
525 self.read(xlen)
526 if flag & 8:
527 while True:
528 s = self.__read(1)
529 if not s or s == NUL:
530 break
531 if flag & 16:
532 while True:
533 s = self.__read(1)
534 if not s or s == NUL:
535 break
536 if flag & 2:
537 self.__read(2)
538
539 def tell(self):
540 """Return the stream's file pointer position.
541 """
542 return self.pos
543
544 def seek(self, pos=0):
545 """Set the stream's file pointer to pos. Negative seeking
546 is forbidden.
547 """
548 if pos - self.pos >= 0:
549 blocks, remainder = divmod(pos - self.pos, self.bufsize)
Guido van Rossum805365e2007-05-07 22:24:25 +0000550 for i in range(blocks):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000551 self.read(self.bufsize)
552 self.read(remainder)
553 else:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554 raise StreamError("seeking backwards is not allowed")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000555 return self.pos
556
557 def read(self, size=None):
558 """Return the next size number of bytes from the stream.
559 If size is not defined, return all bytes of the stream
560 up to EOF.
561 """
562 if size is None:
563 t = []
564 while True:
565 buf = self._read(self.bufsize)
566 if not buf:
567 break
568 t.append(buf)
569 buf = "".join(t)
570 else:
571 buf = self._read(size)
572 self.pos += len(buf)
573 return buf
574
575 def _read(self, size):
576 """Return size bytes from the stream.
577 """
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000578 if self.comptype == "tar":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000579 return self.__read(size)
580
581 c = len(self.dbuf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000582 while c < size:
583 buf = self.__read(self.bufsize)
584 if not buf:
585 break
Guido van Rossumd8faa362007-04-27 19:54:29 +0000586 try:
587 buf = self.cmp.decompress(buf)
588 except IOError:
589 raise ReadError("invalid compressed data")
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000590 self.dbuf += buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000591 c += len(buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000592 buf = self.dbuf[:size]
593 self.dbuf = self.dbuf[size:]
594 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000595
596 def __read(self, size):
597 """Return size bytes from stream. If internal buffer is empty,
598 read another block from the stream.
599 """
600 c = len(self.buf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000601 while c < size:
602 buf = self.fileobj.read(self.bufsize)
603 if not buf:
604 break
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000605 self.buf += buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000606 c += len(buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000607 buf = self.buf[:size]
608 self.buf = self.buf[size:]
609 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000610# class _Stream
611
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000612class _StreamProxy(object):
613 """Small proxy class that enables transparent compression
614 detection for the Stream interface (mode 'r|*').
615 """
616
617 def __init__(self, fileobj):
618 self.fileobj = fileobj
619 self.buf = self.fileobj.read(BLOCKSIZE)
620
621 def read(self, size):
622 self.read = self.fileobj.read
623 return self.buf
624
625 def getcomptype(self):
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000626 if self.buf.startswith(b"\037\213\010"):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000627 return "gz"
Lars Gustäbela280ca752007-08-28 07:34:33 +0000628 if self.buf.startswith(b"BZh91"):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000629 return "bz2"
630 return "tar"
631
632 def close(self):
633 self.fileobj.close()
634# class StreamProxy
635
Thomas Wouters477c8d52006-05-27 19:21:47 +0000636class _BZ2Proxy(object):
637 """Small proxy class that enables external file object
638 support for "r:bz2" and "w:bz2" modes. This is actually
639 a workaround for a limitation in bz2 module's BZ2File
640 class which (unlike gzip.GzipFile) has no support for
641 a file object argument.
642 """
643
644 blocksize = 16 * 1024
645
646 def __init__(self, fileobj, mode):
647 self.fileobj = fileobj
648 self.mode = mode
Guido van Rossumd8faa362007-04-27 19:54:29 +0000649 self.name = getattr(self.fileobj, "name", None)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000650 self.init()
651
652 def init(self):
653 import bz2
654 self.pos = 0
655 if self.mode == "r":
656 self.bz2obj = bz2.BZ2Decompressor()
657 self.fileobj.seek(0)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000658 self.buf = b""
Thomas Wouters477c8d52006-05-27 19:21:47 +0000659 else:
660 self.bz2obj = bz2.BZ2Compressor()
661
662 def read(self, size):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000663 x = len(self.buf)
664 while x < size:
Lars Gustäbel42e00912009-03-22 20:34:29 +0000665 raw = self.fileobj.read(self.blocksize)
666 if not raw:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000667 break
Lars Gustäbel42e00912009-03-22 20:34:29 +0000668 data = self.bz2obj.decompress(raw)
669 self.buf += data
Thomas Wouters477c8d52006-05-27 19:21:47 +0000670 x += len(data)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000671
672 buf = self.buf[:size]
673 self.buf = self.buf[size:]
674 self.pos += len(buf)
675 return buf
676
677 def seek(self, pos):
678 if pos < self.pos:
679 self.init()
680 self.read(pos - self.pos)
681
682 def tell(self):
683 return self.pos
684
685 def write(self, data):
686 self.pos += len(data)
687 raw = self.bz2obj.compress(data)
688 self.fileobj.write(raw)
689
690 def close(self):
691 if self.mode == "w":
692 raw = self.bz2obj.flush()
693 self.fileobj.write(raw)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000694# class _BZ2Proxy
695
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000696#------------------------
697# Extraction file object
698#------------------------
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000699class _FileInFile(object):
700 """A thin wrapper around an existing file object that
701 provides a part of its data as an individual file
702 object.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000703 """
704
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000705 def __init__(self, fileobj, offset, size, blockinfo=None):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000706 self.fileobj = fileobj
707 self.offset = offset
708 self.size = size
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000709 self.position = 0
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000710
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000711 if blockinfo is None:
712 blockinfo = [(0, size)]
713
714 # Construct a map with data and zero blocks.
715 self.map_index = 0
716 self.map = []
717 lastpos = 0
718 realpos = self.offset
719 for offset, size in blockinfo:
720 if offset > lastpos:
721 self.map.append((False, lastpos, offset, None))
722 self.map.append((True, offset, offset + size, realpos))
723 realpos += size
724 lastpos = offset + size
725 if lastpos < self.size:
726 self.map.append((False, lastpos, self.size, None))
727
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000728 def seekable(self):
729 if not hasattr(self.fileobj, "seekable"):
730 # XXX gzip.GzipFile and bz2.BZ2File
731 return True
732 return self.fileobj.seekable()
733
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000734 def tell(self):
735 """Return the current file position.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000736 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000737 return self.position
738
739 def seek(self, position):
740 """Seek to a position in the file.
741 """
742 self.position = position
743
744 def read(self, size=None):
745 """Read data from the file.
746 """
747 if size is None:
748 size = self.size - self.position
749 else:
750 size = min(size, self.size - self.position)
751
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000752 buf = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000753 while size > 0:
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000754 while True:
755 data, start, stop, offset = self.map[self.map_index]
756 if start <= self.position < stop:
757 break
758 else:
759 self.map_index += 1
760 if self.map_index == len(self.map):
761 self.map_index = 0
762 length = min(size, stop - self.position)
763 if data:
Lars Gustäbeldd071042011-02-23 11:42:22 +0000764 self.fileobj.seek(offset + (self.position - start))
765 buf += self.fileobj.read(length)
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000766 else:
767 buf += NUL * length
768 size -= length
769 self.position += length
770 return buf
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000771#class _FileInFile
772
773
class ExFileObject(object):
    """File-like object for reading an archive member.

    Is returned by TarFile.extractfile().  Data is read through a
    _FileInFile view of the archive; this class adds a read-ahead buffer
    so that lines can be returned one at a time.
    """
    # Number of bytes fetched per read while searching for a newline.
    blocksize = 1024

    def __init__(self, tarfile, tarinfo):
        """Create a reader for the member tarinfo of the archive tarfile.
        """
        self.fileobj = _FileInFile(tarfile.fileobj,
                                   tarinfo.offset_data,
                                   tarinfo.size,
                                   tarinfo.sparse)
        self.name = tarinfo.name
        self.mode = "r"
        self.closed = False
        self.size = tarinfo.size

        # Logical position as seen by the caller; the underlying fileobj
        # may be further ahead by len(self.buffer) bytes.
        self.position = 0
        self.buffer = b""

    def readable(self):
        """Return True, the object can always be read from.
        """
        return True

    def writable(self):
        """Return False, the object is read-only.
        """
        return False

    def seekable(self):
        """Return whether the underlying file object is seekable.
        """
        return self.fileobj.seekable()

    def read(self, size=None):
        """Read at most size bytes from the file. If size is not
           present, None or negative, read all data until EOF is reached.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        # A negative size means "read everything", as for regular files.
        # Without this, the buffer would be sliced with a negative index.
        if size is not None and size < 0:
            size = None

        buf = b""
        if self.buffer:
            # Serve data left over from a previous readline() first.
            if size is None:
                buf = self.buffer
                self.buffer = b""
            else:
                buf = self.buffer[:size]
                self.buffer = self.buffer[size:]

        if size is None:
            buf += self.fileobj.read()
        else:
            buf += self.fileobj.read(size - len(buf))

        self.position += len(buf)
        return buf

    # XXX TextIOWrapper uses the read1() method.
    read1 = read

    def readline(self, size=-1):
        """Read one entire line from the file. If size is present
           and non-negative, return a string with at most that
           size, which may be an incomplete line. A size of None or
           any negative value means no limit.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        pos = self.buffer.find(b"\n") + 1
        if pos == 0:
            # no newline found.
            while True:
                buf = self.fileobj.read(self.blocksize)
                self.buffer += buf
                if not buf or b"\n" in buf:
                    pos = self.buffer.find(b"\n") + 1
                    if pos == 0:
                        # no newline found.
                        pos = len(self.buffer)
                    break

        # Only a non-negative size limits the line length.
        if size is not None and size >= 0:
            pos = min(size, pos)

        buf = self.buffer[:pos]
        self.buffer = self.buffer[pos:]
        self.position += len(buf)
        return buf

    def readlines(self):
        """Return a list with all remaining lines.
        """
        result = []
        while True:
            line = self.readline()
            if not line:
                break
            result.append(line)
        return result

    def tell(self):
        """Return the current file position.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        return self.position

    def seek(self, pos, whence=os.SEEK_SET):
        """Seek to a position in the file and return the new position.
           The position is clamped to the range 0..size.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        if whence == os.SEEK_SET:
            self.position = min(max(pos, 0), self.size)
        elif whence == os.SEEK_CUR:
            if pos < 0:
                self.position = max(self.position + pos, 0)
            else:
                self.position = min(self.position + pos, self.size)
        elif whence == os.SEEK_END:
            self.position = max(min(self.size + pos, self.size), 0)
        else:
            raise ValueError("Invalid argument")

        # Buffered read-ahead data is no longer valid after a seek.
        self.buffer = b""
        self.fileobj.seek(self.position)
        return self.position

    def close(self):
        """Close the file object.
        """
        self.closed = True

    def __iter__(self):
        """Get an iterator over the file's lines.
        """
        while True:
            line = self.readline()
            if not line:
                break
            yield line
#class ExFileObject
911
912#------------------
913# Exported Classes
914#------------------
915class TarInfo(object):
916 """Informational class which holds the details about an
917 archive member given by a tar header block.
918 TarInfo objects are returned by TarFile.getmember(),
919 TarFile.getmembers() and TarFile.gettarinfo() and are
920 usually created internally.
921 """
922
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000923 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
924 "chksum", "type", "linkname", "uname", "gname",
925 "devmajor", "devminor",
926 "offset", "offset_data", "pax_headers", "sparse",
927 "tarfile", "_sparse_structs", "_link_target")
928
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000929 def __init__(self, name=""):
930 """Construct a TarInfo object. name is the optional name
931 of the member.
932 """
Guido van Rossumd8faa362007-04-27 19:54:29 +0000933 self.name = name # member name
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000934 self.mode = 0o644 # file permissions
Thomas Wouters477c8d52006-05-27 19:21:47 +0000935 self.uid = 0 # user id
936 self.gid = 0 # group id
937 self.size = 0 # file size
938 self.mtime = 0 # modification time
939 self.chksum = 0 # header checksum
940 self.type = REGTYPE # member type
941 self.linkname = "" # link name
Lars Gustäbel331b8002010-10-04 15:18:47 +0000942 self.uname = "" # user name
943 self.gname = "" # group name
Thomas Wouters477c8d52006-05-27 19:21:47 +0000944 self.devmajor = 0 # device major number
945 self.devminor = 0 # device minor number
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000946
Thomas Wouters477c8d52006-05-27 19:21:47 +0000947 self.offset = 0 # the tar header starts here
948 self.offset_data = 0 # the file's data starts here
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000949
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000950 self.sparse = None # sparse member information
Guido van Rossumd8faa362007-04-27 19:54:29 +0000951 self.pax_headers = {} # pax header information
952
    # In pax headers the "name" and "linkname" fields are called
    # "path" and "linkpath".
    def _getpath(self):
        """Getter of the path property: return self.name."""
        return self.name
    def _setpath(self, name):
        """Setter of the path property: store name in self.name."""
        self.name = name
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        """Getter of the linkpath property: return self.linkname."""
        return self.linkname
    def _setlinkpath(self, linkname):
        """Setter of the linkpath property: store linkname in self.linkname."""
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)
966
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000967 def __repr__(self):
968 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
969
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000970 def get_info(self):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000971 """Return the TarInfo's attributes as a dictionary.
972 """
973 info = {
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +0000974 "name": self.name,
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000975 "mode": self.mode & 0o7777,
Guido van Rossumd8faa362007-04-27 19:54:29 +0000976 "uid": self.uid,
977 "gid": self.gid,
978 "size": self.size,
979 "mtime": self.mtime,
980 "chksum": self.chksum,
981 "type": self.type,
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +0000982 "linkname": self.linkname,
Guido van Rossumd8faa362007-04-27 19:54:29 +0000983 "uname": self.uname,
984 "gname": self.gname,
985 "devmajor": self.devmajor,
986 "devminor": self.devminor
987 }
988
989 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
990 info["name"] += "/"
991
992 return info
993
Victor Stinnerde629d42010-05-05 21:43:57 +0000994 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000995 """Return a tar header as a string of 512 byte blocks.
996 """
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000997 info = self.get_info()
Guido van Rossume7ba4952007-06-06 23:52:48 +0000998
Guido van Rossumd8faa362007-04-27 19:54:29 +0000999 if format == USTAR_FORMAT:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001000 return self.create_ustar_header(info, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001001 elif format == GNU_FORMAT:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001002 return self.create_gnu_header(info, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001003 elif format == PAX_FORMAT:
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001004 return self.create_pax_header(info, encoding)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001005 else:
1006 raise ValueError("invalid format")
1007
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001008 def create_ustar_header(self, info, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001009 """Return the object as a ustar header block.
1010 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001011 info["magic"] = POSIX_MAGIC
1012
1013 if len(info["linkname"]) > LENGTH_LINK:
1014 raise ValueError("linkname is too long")
1015
1016 if len(info["name"]) > LENGTH_NAME:
1017 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1018
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001019 return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001020
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001021 def create_gnu_header(self, info, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001022 """Return the object as a GNU header block sequence.
1023 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001024 info["magic"] = GNU_MAGIC
1025
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001026 buf = b""
Guido van Rossumd8faa362007-04-27 19:54:29 +00001027 if len(info["linkname"]) > LENGTH_LINK:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001028 buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001029
1030 if len(info["name"]) > LENGTH_NAME:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001031 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001032
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001033 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001034
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001035 def create_pax_header(self, info, encoding):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001036 """Return the object as a ustar header block. If it cannot be
1037 represented this way, prepend a pax extended header sequence
1038 with supplement information.
1039 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001040 info["magic"] = POSIX_MAGIC
1041 pax_headers = self.pax_headers.copy()
1042
1043 # Test string fields for values that exceed the field length or cannot
1044 # be represented in ASCII encoding.
1045 for name, hname, length in (
1046 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
1047 ("uname", "uname", 32), ("gname", "gname", 32)):
1048
Guido van Rossume7ba4952007-06-06 23:52:48 +00001049 if hname in pax_headers:
1050 # The pax header has priority.
1051 continue
1052
Guido van Rossumd8faa362007-04-27 19:54:29 +00001053 # Try to encode the string as ASCII.
1054 try:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001055 info[name].encode("ascii", "strict")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001056 except UnicodeEncodeError:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001057 pax_headers[hname] = info[name]
Guido van Rossumd8faa362007-04-27 19:54:29 +00001058 continue
1059
Guido van Rossume7ba4952007-06-06 23:52:48 +00001060 if len(info[name]) > length:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001061 pax_headers[hname] = info[name]
Guido van Rossumd8faa362007-04-27 19:54:29 +00001062
1063 # Test number fields for values that exceed the field limit or values
1064 # that like to be stored as float.
1065 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
Guido van Rossume7ba4952007-06-06 23:52:48 +00001066 if name in pax_headers:
1067 # The pax header has priority. Avoid overflow.
1068 info[name] = 0
1069 continue
1070
Guido van Rossumd8faa362007-04-27 19:54:29 +00001071 val = info[name]
1072 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001073 pax_headers[name] = str(val)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001074 info[name] = 0
1075
Guido van Rossume7ba4952007-06-06 23:52:48 +00001076 # Create a pax extended header if necessary.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001077 if pax_headers:
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001078 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001079 else:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001080 buf = b""
Guido van Rossumd8faa362007-04-27 19:54:29 +00001081
Lars Gustäbel3741eff2007-08-21 12:17:05 +00001082 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001083
1084 @classmethod
Lars Gustäbel3741eff2007-08-21 12:17:05 +00001085 def create_pax_global_header(cls, pax_headers):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001086 """Return the object as a pax global header block sequence.
1087 """
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001088 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001089
1090 def _posix_split_name(self, name):
1091 """Split a name longer than 100 chars into a prefix
1092 and a name part.
1093 """
1094 prefix = name[:LENGTH_PREFIX + 1]
1095 while prefix and prefix[-1] != "/":
1096 prefix = prefix[:-1]
1097
1098 name = name[len(prefix):]
1099 prefix = prefix[:-1]
1100
1101 if not prefix or len(name) > LENGTH_NAME:
1102 raise ValueError("name is too long")
1103 return prefix, name
1104
1105 @staticmethod
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001106 def _create_header(info, format, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001107 """Return a header block. info is a dictionary with file
1108 information, format must be one of the *_FORMAT constants.
1109 """
1110 parts = [
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001111 stn(info.get("name", ""), 100, encoding, errors),
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001112 itn(info.get("mode", 0) & 0o7777, 8, format),
Guido van Rossumd8faa362007-04-27 19:54:29 +00001113 itn(info.get("uid", 0), 8, format),
1114 itn(info.get("gid", 0), 8, format),
1115 itn(info.get("size", 0), 12, format),
1116 itn(info.get("mtime", 0), 12, format),
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001117 b" ", # checksum field
Guido van Rossumd8faa362007-04-27 19:54:29 +00001118 info.get("type", REGTYPE),
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001119 stn(info.get("linkname", ""), 100, encoding, errors),
1120 info.get("magic", POSIX_MAGIC),
Lars Gustäbel331b8002010-10-04 15:18:47 +00001121 stn(info.get("uname", ""), 32, encoding, errors),
1122 stn(info.get("gname", ""), 32, encoding, errors),
Guido van Rossumd8faa362007-04-27 19:54:29 +00001123 itn(info.get("devmajor", 0), 8, format),
1124 itn(info.get("devminor", 0), 8, format),
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001125 stn(info.get("prefix", ""), 155, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001126 ]
1127
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001128 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
Guido van Rossumd8faa362007-04-27 19:54:29 +00001129 chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
Lars Gustäbela280ca752007-08-28 07:34:33 +00001130 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
Guido van Rossumd8faa362007-04-27 19:54:29 +00001131 return buf
1132
1133 @staticmethod
1134 def _create_payload(payload):
1135 """Return the string payload filled with zero bytes
1136 up to the next 512 byte border.
1137 """
1138 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1139 if remainder > 0:
1140 payload += (BLOCKSIZE - remainder) * NUL
1141 return payload
1142
1143 @classmethod
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001144 def _create_gnu_long_header(cls, name, type, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001145 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1146 for name.
1147 """
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001148 name = name.encode(encoding, errors) + NUL
Guido van Rossumd8faa362007-04-27 19:54:29 +00001149
1150 info = {}
1151 info["name"] = "././@LongLink"
1152 info["type"] = type
1153 info["size"] = len(name)
1154 info["magic"] = GNU_MAGIC
1155
1156 # create extended header + name blocks.
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001157 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
Guido van Rossumd8faa362007-04-27 19:54:29 +00001158 cls._create_payload(name)
1159
1160 @classmethod
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001161 def _create_pax_generic_header(cls, pax_headers, type, encoding):
1162 """Return a POSIX.1-2008 extended or global header sequence
Guido van Rossumd8faa362007-04-27 19:54:29 +00001163 that contains a list of keyword, value pairs. The values
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001164 must be strings.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001165 """
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001166 # Check if one of the fields contains surrogate characters and thereby
1167 # forces hdrcharset=BINARY, see _proc_pax() for more information.
1168 binary = False
1169 for keyword, value in pax_headers.items():
1170 try:
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001171 value.encode("utf-8", "strict")
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001172 except UnicodeEncodeError:
1173 binary = True
1174 break
1175
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001176 records = b""
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001177 if binary:
1178 # Put the hdrcharset field at the beginning of the header.
1179 records += b"21 hdrcharset=BINARY\n"
1180
Guido van Rossumd8faa362007-04-27 19:54:29 +00001181 for keyword, value in pax_headers.items():
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001182 keyword = keyword.encode("utf-8")
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001183 if binary:
1184 # Try to restore the original byte representation of `value'.
1185 # Needless to say, that the encoding must match the string.
1186 value = value.encode(encoding, "surrogateescape")
1187 else:
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001188 value = value.encode("utf-8")
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001189
Guido van Rossumd8faa362007-04-27 19:54:29 +00001190 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n'
1191 n = p = 0
1192 while True:
1193 n = l + len(str(p))
1194 if n == p:
1195 break
1196 p = n
Lars Gustäbela280ca752007-08-28 07:34:33 +00001197 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"
Guido van Rossumd8faa362007-04-27 19:54:29 +00001198
1199 # We use a hardcoded "././@PaxHeader" name like star does
1200 # instead of the one that POSIX recommends.
1201 info = {}
1202 info["name"] = "././@PaxHeader"
1203 info["type"] = type
1204 info["size"] = len(records)
1205 info["magic"] = POSIX_MAGIC
1206
1207 # Create pax header + record blocks.
Lars Gustäbel3741eff2007-08-21 12:17:05 +00001208 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
Guido van Rossumd8faa362007-04-27 19:54:29 +00001209 cls._create_payload(records)
1210
Guido van Rossum75b64e62005-01-16 00:16:11 +00001211 @classmethod
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001212 def frombuf(cls, buf, encoding, errors):
1213 """Construct a TarInfo object from a 512 byte bytes object.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001214 """
Lars Gustäbel9520a432009-11-22 18:48:49 +00001215 if len(buf) == 0:
1216 raise EmptyHeaderError("empty header")
Thomas Wouters477c8d52006-05-27 19:21:47 +00001217 if len(buf) != BLOCKSIZE:
Lars Gustäbel9520a432009-11-22 18:48:49 +00001218 raise TruncatedHeaderError("truncated header")
Thomas Wouters477c8d52006-05-27 19:21:47 +00001219 if buf.count(NUL) == BLOCKSIZE:
Lars Gustäbel9520a432009-11-22 18:48:49 +00001220 raise EOFHeaderError("end of file header")
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001221
1222 chksum = nti(buf[148:156])
1223 if chksum not in calc_chksums(buf):
Lars Gustäbel9520a432009-11-22 18:48:49 +00001224 raise InvalidHeaderError("bad checksum")
Thomas Wouters477c8d52006-05-27 19:21:47 +00001225
Guido van Rossumd8faa362007-04-27 19:54:29 +00001226 obj = cls()
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001227 obj.name = nts(buf[0:100], encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001228 obj.mode = nti(buf[100:108])
1229 obj.uid = nti(buf[108:116])
1230 obj.gid = nti(buf[116:124])
1231 obj.size = nti(buf[124:136])
1232 obj.mtime = nti(buf[136:148])
1233 obj.chksum = chksum
1234 obj.type = buf[156:157]
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001235 obj.linkname = nts(buf[157:257], encoding, errors)
1236 obj.uname = nts(buf[265:297], encoding, errors)
1237 obj.gname = nts(buf[297:329], encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001238 obj.devmajor = nti(buf[329:337])
1239 obj.devminor = nti(buf[337:345])
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001240 prefix = nts(buf[345:500], encoding, errors)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001241
Guido van Rossumd8faa362007-04-27 19:54:29 +00001242 # Old V7 tar format represents a directory as a regular
1243 # file with a trailing slash.
1244 if obj.type == AREGTYPE and obj.name.endswith("/"):
1245 obj.type = DIRTYPE
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001246
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +00001247 # The old GNU sparse format occupies some of the unused
1248 # space in the buffer for up to 4 sparse structures.
1249 # Save the them for later processing in _proc_sparse().
1250 if obj.type == GNUTYPE_SPARSE:
1251 pos = 386
1252 structs = []
1253 for i in range(4):
1254 try:
1255 offset = nti(buf[pos:pos + 12])
1256 numbytes = nti(buf[pos + 12:pos + 24])
1257 except ValueError:
1258 break
1259 structs.append((offset, numbytes))
1260 pos += 24
1261 isextended = bool(buf[482])
1262 origsize = nti(buf[483:495])
1263 obj._sparse_structs = (structs, isextended, origsize)
1264
Guido van Rossumd8faa362007-04-27 19:54:29 +00001265 # Remove redundant slashes from directories.
1266 if obj.isdir():
1267 obj.name = obj.name.rstrip("/")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001268
Guido van Rossumd8faa362007-04-27 19:54:29 +00001269 # Reconstruct a ustar longname.
1270 if prefix and obj.type not in GNU_TYPES:
1271 obj.name = prefix + "/" + obj.name
1272 return obj
1273
1274 @classmethod
1275 def fromtarfile(cls, tarfile):
1276 """Return the next TarInfo object from TarFile object
1277 tarfile.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001278 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001279 buf = tarfile.fileobj.read(BLOCKSIZE)
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001280 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001281 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1282 return obj._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001283
Guido van Rossumd8faa362007-04-27 19:54:29 +00001284 #--------------------------------------------------------------------------
1285 # The following are methods that are called depending on the type of a
1286 # member. The entry point is _proc_member() which can be overridden in a
1287 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1288 # implement the following
1289 # operations:
1290 # 1. Set self.offset_data to the position where the data blocks begin,
1291 # if there is data that follows.
1292 # 2. Set tarfile.offset to the position where the next member's header will
1293 # begin.
1294 # 3. Return self or another valid TarInfo object.
1295 def _proc_member(self, tarfile):
1296 """Choose the right processing method depending on
1297 the type and call it.
Thomas Wouters89f507f2006-12-13 04:49:30 +00001298 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001299 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1300 return self._proc_gnulong(tarfile)
1301 elif self.type == GNUTYPE_SPARSE:
1302 return self._proc_sparse(tarfile)
1303 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1304 return self._proc_pax(tarfile)
1305 else:
1306 return self._proc_builtin(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001307
Guido van Rossumd8faa362007-04-27 19:54:29 +00001308 def _proc_builtin(self, tarfile):
1309 """Process a builtin type or an unknown type which
1310 will be treated as a regular file.
1311 """
1312 self.offset_data = tarfile.fileobj.tell()
1313 offset = self.offset_data
1314 if self.isreg() or self.type not in SUPPORTED_TYPES:
1315 # Skip the following data blocks.
1316 offset += self._block(self.size)
1317 tarfile.offset = offset
Thomas Wouters89f507f2006-12-13 04:49:30 +00001318
Guido van Rossume7ba4952007-06-06 23:52:48 +00001319 # Patch the TarInfo object with saved global
Guido van Rossumd8faa362007-04-27 19:54:29 +00001320 # header information.
Guido van Rossume7ba4952007-06-06 23:52:48 +00001321 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001322
1323 return self
1324
1325 def _proc_gnulong(self, tarfile):
1326 """Process the blocks that hold a GNU longname
1327 or longlink member.
1328 """
1329 buf = tarfile.fileobj.read(self._block(self.size))
1330
1331 # Fetch the next header and process it.
Lars Gustäbel9520a432009-11-22 18:48:49 +00001332 try:
1333 next = self.fromtarfile(tarfile)
1334 except HeaderError:
1335 raise SubsequentHeaderError("missing or bad subsequent header")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001336
1337 # Patch the TarInfo object from the next header with
1338 # the longname information.
1339 next.offset = self.offset
1340 if self.type == GNUTYPE_LONGNAME:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001341 next.name = nts(buf, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001342 elif self.type == GNUTYPE_LONGLINK:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001343 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001344
1345 return next
1346
1347 def _proc_sparse(self, tarfile):
1348 """Process a GNU sparse header plus extra headers.
1349 """
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +00001350 # We already collected some sparse structures in frombuf().
1351 structs, isextended, origsize = self._sparse_structs
1352 del self._sparse_structs
Guido van Rossumd8faa362007-04-27 19:54:29 +00001353
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +00001354 # Collect sparse structures from extended header blocks.
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001355 while isextended:
Guido van Rossumd8faa362007-04-27 19:54:29 +00001356 buf = tarfile.fileobj.read(BLOCKSIZE)
1357 pos = 0
Guido van Rossum805365e2007-05-07 22:24:25 +00001358 for i in range(21):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001359 try:
1360 offset = nti(buf[pos:pos + 12])
1361 numbytes = nti(buf[pos + 12:pos + 24])
1362 except ValueError:
1363 break
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001364 if offset and numbytes:
1365 structs.append((offset, numbytes))
Guido van Rossumd8faa362007-04-27 19:54:29 +00001366 pos += 24
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001367 isextended = bool(buf[504])
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001368 self.sparse = structs
Guido van Rossumd8faa362007-04-27 19:54:29 +00001369
1370 self.offset_data = tarfile.fileobj.tell()
1371 tarfile.offset = self.offset_data + self._block(self.size)
1372 self.size = origsize
Guido van Rossumd8faa362007-04-27 19:54:29 +00001373 return self
1374
1375 def _proc_pax(self, tarfile):
1376 """Process an extended or global header as described in
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001377 POSIX.1-2008.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001378 """
1379 # Read the header information.
1380 buf = tarfile.fileobj.read(self._block(self.size))
1381
1382 # A pax header stores supplemental information for either
1383 # the following file (extended) or all following files
1384 # (global).
1385 if self.type == XGLTYPE:
1386 pax_headers = tarfile.pax_headers
1387 else:
1388 pax_headers = tarfile.pax_headers.copy()
1389
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001390 # Check if the pax header contains a hdrcharset field. This tells us
1391 # the encoding of the path, linkpath, uname and gname fields. Normally,
1392 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1393 # implementations are allowed to store them as raw binary strings if
1394 # the translation to UTF-8 fails.
1395 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1396 if match is not None:
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001397 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001398
1399 # For the time being, we don't care about anything other than "BINARY".
1400 # The only other value that is currently allowed by the standard is
1401 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1402 hdrcharset = pax_headers.get("hdrcharset")
1403 if hdrcharset == "BINARY":
1404 encoding = tarfile.encoding
1405 else:
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001406 encoding = "utf-8"
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001407
Guido van Rossumd8faa362007-04-27 19:54:29 +00001408 # Parse pax header information. A record looks like that:
1409 # "%d %s=%s\n" % (length, keyword, value). length is the size
1410 # of the complete record including the length field itself and
Guido van Rossume7ba4952007-06-06 23:52:48 +00001411 # the newline. keyword and value are both UTF-8 encoded strings.
Antoine Pitroufd036452008-08-19 17:56:33 +00001412 regex = re.compile(br"(\d+) ([^=]+)=")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001413 pos = 0
1414 while True:
1415 match = regex.match(buf, pos)
1416 if not match:
1417 break
1418
1419 length, keyword = match.groups()
1420 length = int(length)
1421 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1422
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001423 # Normally, we could just use "utf-8" as the encoding and "strict"
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001424 # as the error handler, but we better not take the risk. For
1425 # example, GNU tar <= 1.23 is known to store filenames it cannot
1426 # translate to UTF-8 as raw strings (unfortunately without a
1427 # hdrcharset=BINARY header).
1428 # We first try the strict standard encoding, and if that fails we
1429 # fall back on the user's encoding and error handler.
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001430 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001431 tarfile.errors)
1432 if keyword in PAX_NAME_FIELDS:
1433 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1434 tarfile.errors)
1435 else:
Marc-André Lemburg8f36af72011-02-25 15:42:01 +00001436 value = self._decode_pax_field(value, "utf-8", "utf-8",
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001437 tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001438
1439 pax_headers[keyword] = value
1440 pos += length
1441
Guido van Rossume7ba4952007-06-06 23:52:48 +00001442 # Fetch the next header.
Lars Gustäbel9520a432009-11-22 18:48:49 +00001443 try:
1444 next = self.fromtarfile(tarfile)
1445 except HeaderError:
1446 raise SubsequentHeaderError("missing or bad subsequent header")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001447
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001448 # Process GNU sparse information.
1449 if "GNU.sparse.map" in pax_headers:
1450 # GNU extended sparse format version 0.1.
1451 self._proc_gnusparse_01(next, pax_headers)
1452
1453 elif "GNU.sparse.size" in pax_headers:
1454 # GNU extended sparse format version 0.0.
1455 self._proc_gnusparse_00(next, pax_headers, buf)
1456
1457 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1458 # GNU extended sparse format version 1.0.
1459 self._proc_gnusparse_10(next, pax_headers, tarfile)
1460
Guido van Rossume7ba4952007-06-06 23:52:48 +00001461 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
Guido van Rossume7ba4952007-06-06 23:52:48 +00001462 # Patch the TarInfo object with the extended header info.
1463 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1464 next.offset = self.offset
1465
1466 if "size" in pax_headers:
1467 # If the extended header replaces the size field,
1468 # we need to recalculate the offset where the next
1469 # header starts.
1470 offset = next.offset_data
1471 if next.isreg() or next.type not in SUPPORTED_TYPES:
1472 offset += next._block(next.size)
1473 tarfile.offset = offset
1474
1475 return next
1476
def _proc_gnusparse_00(self, next, pax_headers, buf):
    """Process a GNU tar extended sparse header, version 0.0.

    The raw pax record buffer `buf' is scanned for the repeated
    GNU.sparse.offset and GNU.sparse.numbytes records. The values are
    paired up in order of appearance and stored on `next.sparse' as a
    list of (offset, numbytes) tuples. `pax_headers' is not consulted.
    """
    def collect(pattern):
        # Each record has the form b"<length> <keyword>=<digits>\n".
        return [int(found.group(1)) for found in re.finditer(pattern, buf)]

    starts = collect(br"\d+ GNU.sparse.offset=(\d+)\n")
    sizes = collect(br"\d+ GNU.sparse.numbytes=(\d+)\n")
    next.sparse = list(zip(starts, sizes))
1487
def _proc_gnusparse_01(self, next, pax_headers):
    """Process a GNU tar extended sparse header, version 0.1.

    The "GNU.sparse.map" pax header is a comma-separated list of
    integers; consecutive values are grouped into (offset, numbytes)
    tuples and stored on `next.sparse'.
    """
    values = list(map(int, pax_headers["GNU.sparse.map"].split(",")))
    # Drawing twice from one iterator per step pairs up neighbours;
    # a dangling last value is dropped.
    stream = iter(values)
    next.sparse = list(zip(stream, stream))
1493
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

    The sparse map is read from `tarfile.fileobj' in BLOCKSIZE chunks
    as newline-terminated decimal numbers: first the number of map
    entries, then an offset and a size for each entry. Afterwards
    `next.offset_data' is set to the file object's current position
    and `next.sparse' to the list of (offset, numbytes) tuples.
    """
    source = tarfile.fileobj
    pending = source.read(BLOCKSIZE)
    count_field, pending = pending.split(b"\n", 1)
    wanted = int(count_field) * 2       # two numbers per map entry

    numbers = []
    while len(numbers) < wanted:
        # Fetch another block when no complete line is buffered.
        if b"\n" not in pending:
            pending += source.read(BLOCKSIZE)
        number, pending = pending.split(b"\n", 1)
        numbers.append(int(number))

    next.offset_data = source.tell()
    next.sparse = list(zip(numbers[::2], numbers[1::2]))
1509
def _apply_pax_info(self, pax_headers, encoding, errors):
    """Replace fields with supplemental information from a previous
    pax extended or global header.

    GNU.sparse.name overrides `path'; GNU.sparse.size and
    GNU.sparse.realsize override `size'. Keywords listed in PAX_FIELDS
    are copied onto the attribute of the same name; numeric fields
    that cannot be converted become 0 and a trailing "/" is stripped
    from `path'. A copy of `pax_headers' is kept in `self.pax_headers'.
    `encoding' and `errors' are accepted but not used here.
    """
    for keyword, value in pax_headers.items():
        if keyword == "GNU.sparse.name":
            self.path = value
        elif keyword in ("GNU.sparse.size", "GNU.sparse.realsize"):
            self.size = int(value)
        elif keyword in PAX_FIELDS:
            if keyword in PAX_NUMBER_FIELDS:
                convert = PAX_NUMBER_FIELDS[keyword]
                try:
                    value = convert(value)
                except ValueError:
                    value = 0
            if keyword == "path":
                value = value.rstrip("/")
            setattr(self, keyword, value)

    self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001532
def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
    """Decode a single field from a pax record.

    `value' is first decoded strictly with `encoding'. If that raises
    UnicodeDecodeError, it is decoded with `fallback_encoding' and the
    `fallback_errors' error handler instead.
    """
    try:
        text = value.decode(encoding, "strict")
    except UnicodeDecodeError:
        text = value.decode(fallback_encoding, fallback_errors)
    return text
1540
def _block(self, count):
    """Round up a byte count by BLOCKSIZE and return it,
    e.g. _block(834) => 1024.
    """
    whole, leftover = divmod(count, BLOCKSIZE)
    # A partially filled block still occupies a full block.
    return (whole + 1 if leftover else whole) * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001549
def isreg(self):
    """Return True if the member's type is one of REGULAR_TYPES."""
    return self.type in REGULAR_TYPES

def isfile(self):
    """Return whatever isreg() returns."""
    return self.isreg()

def isdir(self):
    """Return True if the member's type is DIRTYPE."""
    return self.type == DIRTYPE

def issym(self):
    """Return True if the member's type is SYMTYPE."""
    return self.type == SYMTYPE

def islnk(self):
    """Return True if the member's type is LNKTYPE."""
    return self.type == LNKTYPE

def ischr(self):
    """Return True if the member's type is CHRTYPE."""
    return self.type == CHRTYPE

def isblk(self):
    """Return True if the member's type is BLKTYPE."""
    return self.type == BLKTYPE

def isfifo(self):
    """Return True if the member's type is FIFOTYPE."""
    return self.type == FIFOTYPE

def issparse(self):
    """Return True if a sparse map has been recorded for the member."""
    return self.sparse is not None

def isdev(self):
    """Return True if the member's type is CHRTYPE, BLKTYPE or FIFOTYPE."""
    return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1570# class TarInfo
1571
1572class TarFile(object):
1573 """The TarFile Class provides an interface to tar archives.
1574 """
1575
1576 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs)
1577
1578 dereference = False # If true, add content of linked file to the
1579 # tar file, else the link.
1580
1581 ignore_zeros = False # If true, skips empty or invalid blocks and
1582 # continues processing.
1583
Lars Gustäbel365aff32009-12-13 11:42:29 +00001584 errorlevel = 1 # If 0, fatal errors only appear in debug
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001585 # messages (if debug >= 0). If > 0, errors
1586 # are passed to the caller as exceptions.
1587
Guido van Rossumd8faa362007-04-27 19:54:29 +00001588 format = DEFAULT_FORMAT # The format to use when creating an archive.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001589
Guido van Rossume7ba4952007-06-06 23:52:48 +00001590 encoding = ENCODING # Encoding for 8-bit character strings.
1591
1592 errors = None # Error handler for unicode conversion.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001593
Guido van Rossumd8faa362007-04-27 19:54:29 +00001594 tarinfo = TarInfo # The default TarInfo class to use.
1595
1596 fileobject = ExFileObject # The default ExFileObject class to use.
1597
def __init__(self, name=None, mode="r", fileobj=None, format=None,
        tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
        errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
    """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
    read from an existing archive, 'a' to append data to an existing
    file or 'w' to create a new file overwriting an existing one. `mode'
    defaults to 'r'.
    If `fileobj' is given, it is used for reading or writing data. If it
    can be determined, `mode' is overridden by `fileobj's mode.
    `fileobj' is not closed, when TarFile is closed.

    The remaining arguments override the class-level defaults of the
    same name when they are not None; `errors' is always stored.
    `pax_headers' is only honoured when the format is PAX_FORMAT.

    Raises ValueError if `mode' is not exactly 'r', 'a' or 'w', and
    ReadError if an archive opened for appending has a bad header.
    """
    # Validate by exact membership: a substring test would accept the
    # empty string and then fail with KeyError on the lookup below.
    modes = {"r": "rb", "a": "r+b", "w": "wb"}
    if mode not in modes:
        raise ValueError("mode must be 'r', 'a' or 'w'")
    self.mode = mode
    self._mode = modes[mode]

    if not fileobj:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        self._extfileobj = False
    else:
        if name is None and hasattr(fileobj, "name"):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Init attributes.
    if format is not None:
        self.format = format
    if tarinfo is not None:
        self.tarinfo = tarinfo
    if dereference is not None:
        self.dereference = dereference
    if ignore_zeros is not None:
        self.ignore_zeros = ignore_zeros
    if encoding is not None:
        self.encoding = encoding
    self.errors = errors

    if pax_headers is not None and self.format == PAX_FORMAT:
        self.pax_headers = pax_headers
    else:
        self.pax_headers = {}

    if debug is not None:
        self.debug = debug
    if errorlevel is not None:
        self.errorlevel = errorlevel

    # Init datastructures.
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    tarinfo = self.tarinfo.fromtarfile(self)
                    self.members.append(tarinfo)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e))

        if self.mode in ("a", "w"):
            self._loaded = True

            if self.pax_headers:
                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    except:
        # Whatever went wrong, do not leak a file we opened ourselves.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001693
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001694 #--------------------------------------------------------------------------
1695 # Below are the classmethods which act as alternate constructors to the
1696 # TarFile class. The open() method is the only one that is needed for
1697 # public use; it is the "super"-constructor and is able to select an
1698 # adequate "sub"-constructor for a particular compression using the mapping
1699 # from OPEN_METH.
1700 #
1701 # This concept allows one to subclass TarFile without losing the comfort of
1702 # the super-constructor. A sub-constructor is registered and made available
1703 # by adding it to the mapping in OPEN_METH.
1704
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
    an appropriate TarFile class.

    mode:
    'r' or 'r:*' open for reading with transparent compression
    'r:'         open for reading exclusively uncompressed
    'r:gz'       open for reading with gzip compression
    'r:bz2'      open for reading with bzip2 compression
    'a' or 'a:'  open for appending, creating the file if necessary
    'w' or 'w:'  open for writing without compression
    'w:gz'       open for writing with gzip compression
    'w:bz2'      open for writing with bzip2 compression

    'r|*'        open a stream of tar blocks with transparent compression
    'r|'         open an uncompressed stream of tar blocks for reading
    'r|gz'       open a gzip compressed stream of tar blocks
    'r|bz2'      open a bzip2 compressed stream of tar blocks
    'w|'         open an uncompressed stream for writing
    'w|gz'       open a gzip compressed stream for writing
    'w|bz2'      open a bzip2 compressed stream for writing

    Raises ValueError when neither `name' nor `fileobj' is given or the
    mode is not recognised, CompressionError for an unknown compression
    type and ReadError when no registered opener accepts the file.
    """

    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Find out which *open() is appropriate for opening the file.
        for comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
            if fileobj is not None:
                saved_pos = fileobj.tell()
            try:
                return func(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Rewind so the next opener sees the same data.
                if fileobj is not None:
                    fileobj.seek(saved_pos)
                continue
        raise ReadError("file could not be opened successfully")

    elif ":" in mode:
        filemode, comptype = mode.split(":", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
        else:
            raise CompressionError("unknown compression type %r" % comptype)
        return func(name, filemode, fileobj, **kwargs)

    elif "|" in mode:
        filemode, comptype = mode.split("|", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Exact membership: a substring test would let "rw" through.
        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            t = cls(name, filemode, stream, **kwargs)
        except:
            stream.close()
            raise
        t._extfileobj = False
        return t

    # Exact membership: a substring test would let "" and "aw" through.
    elif mode in ("a", "w"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001780
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.

    Returns cls(name, mode, fileobj, **kwargs). Raises ValueError
    unless `mode' is exactly 'r', 'a' or 'w'.
    """
    # Exact membership: a substring test would accept the empty string.
    if mode not in ("r", "a", "w"):
        raise ValueError("mode must be 'r', 'a' or 'w'")
    return cls(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001788
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
    Appending is not allowed.

    Raises ValueError unless `mode' is exactly 'r' or 'w',
    CompressionError if the gzip module is unusable, and ReadError if
    an IOError occurs once a gzip file object exists.
    """
    # Exact membership: a substring test would accept the empty string.
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    extfileobj = fileobj is not None
    try:
        fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except IOError:
        if not extfileobj and fileobj is not None:
            fileobj.close()
        # No file object at all means GzipFile itself could not open
        # `name'; report that error unchanged.
        if fileobj is None:
            raise
        raise ReadError("not a gzip file")
    except:
        if not extfileobj and fileobj is not None:
            fileobj.close()
        raise
    t._extfileobj = extfileobj
    return t
1819
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
    Appending is not allowed.

    Raises ValueError unless `mode' is exactly 'r' or 'w',
    CompressionError if the bz2 module cannot be imported, and
    ReadError if opening the archive raises IOError or EOFError.
    """
    # Exact membership: a substring test would accept the empty string.
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    if fileobj is not None:
        fileobj = _BZ2Proxy(fileobj, mode)
    else:
        fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (IOError, EOFError):
        fileobj.close()
        raise ReadError("not a bzip2 file")
    t._extfileobj = False
    return t
1845
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001846 # All *open() methods are registered here.
1847 OPEN_METH = {
1848 "tar": "taropen", # uncompressed tar
1849 "gz": "gzopen", # gzip compressed tar
1850 "bz2": "bz2open" # bzip2 compressed tar
1851 }
1852
1853 #--------------------------------------------------------------------------
1854 # The public methods which TarFile provides:
1855
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
    appended to the archive.

    Calling close() on an already closed TarFile does nothing. A file
    object that was opened by the TarFile itself is closed even when
    writing the finishing blocks fails.
    """
    if self.closed:
        return

    self.closed = True
    try:
        if self.mode in ("a", "w"):
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
    finally:
        # Release the handle even if a write above raised.
        if not self._extfileobj:
            self.fileobj.close()
1875
def getmember(self, name):
    """Return a TarInfo object for member `name'. If `name' can not be
    found in the archive, KeyError is raised. If a member occurs more
    than once in the archive, its last occurrence is assumed to be the
    most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001886
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects. The
    list has the same order as the members in the archive.
    """
    self._check()
    if self._loaded:
        return self.members
    # A complete member list requires scanning the whole archive first.
    self._load()
    return self.members
1896
def getnames(self):
    """Return the members of the archive as a list of their names. It has
    the same order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001902
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object for either the file `name' or the file
    object `fileobj' (using os.fstat on its file descriptor). You can
    modify some of the TarInfo's attributes before you add it using
    addfile(). If given, `arcname' specifies an alternative name for the
    file in the archive.

    Returns None when the file is not a regular file, directory, fifo,
    symbolic link, character device or block device.
    """
    self._check("aw")

    # A file object, when given, determines the real file name.
    if fileobj is not None:
        name = fileobj.name

    # Build the member name: drop any drive, use forward slashes and
    # turn an absolute path into a relative one.
    if arcname is None:
        arcname = name
    drv, arcname = os.path.splitdrive(arcname)
    arcname = arcname.replace(os.sep, "/").lstrip("/")

    tarinfo = self.tarinfo()
    tarinfo.tarfile = self

    # Pick fstat, lstat or stat depending on what was passed, on the
    # platform and on whether symlinks shall be resolved.
    if fileobj is not None:
        statres = os.fstat(fileobj.fileno())
    elif hasattr(os, "lstat") and not self.dereference:
        statres = os.lstat(name)
    else:
        statres = os.stat(name)

    linkname = ""
    stmd = statres.st_mode
    if stat.S_ISREG(stmd):
        inode = (statres.st_ino, statres.st_dev)
        is_known_hardlink = (not self.dereference
                             and statres.st_nlink > 1
                             and inode in self.inodes
                             and arcname != self.inodes[inode])
        if is_known_hardlink:
            # A hardlink to a file that is already in the archive.
            ftype = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # The inode is added only if its valid.
            # For win32 it is always 0.
            ftype = REGTYPE
            if inode[0]:
                self.inodes[inode] = arcname
    else:
        candidates = (
            (stat.S_ISDIR, DIRTYPE),
            (stat.S_ISFIFO, FIFOTYPE),
            (stat.S_ISLNK, SYMTYPE),
            (stat.S_ISCHR, CHRTYPE),
            (stat.S_ISBLK, BLKTYPE),
        )
        for matches, ftype in candidates:
            if matches(stmd):
                break
        else:
            return None
        if ftype == SYMTYPE:
            linkname = os.readlink(name)

    # Fill the TarInfo object with all
    # information we can get.
    tarinfo.name = arcname
    tarinfo.mode = stmd
    tarinfo.uid = statres.st_uid
    tarinfo.gid = statres.st_gid
    # Only regular files carry data in the archive.
    tarinfo.size = statres.st_size if ftype == REGTYPE else 0
    tarinfo.mtime = statres.st_mtime
    tarinfo.type = ftype
    tarinfo.linkname = linkname
    if pwd:
        try:
            tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
        except KeyError:
            pass

    if ftype in (CHRTYPE, BLKTYPE):
        if hasattr(os, "major") and hasattr(os, "minor"):
            tarinfo.devmajor = os.major(statres.st_rdev)
            tarinfo.devminor = os.minor(statres.st_rdev)
    return tarinfo
2000
def list(self, verbose=True):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
    the names of the members are printed. If it is True, an `ls -l'-like
    output is produced.
    """
    self._check()

    for member in self:
        if verbose:
            print(filemode(member.mode), end=' ')
            owner = "%s/%s" % (member.uname or member.uid,
                               member.gname or member.gid)
            print(owner, end=' ')
            if member.ischr() or member.isblk():
                # Device nodes show "major,minor" in the size column.
                device = "%d,%d" % (member.devmajor, member.devminor)
                print("%10s" % device, end=' ')
            else:
                print("%10d" % member.size, end=' ')
            stamp = time.localtime(member.mtime)[:6]
            print("%d-%02d-%02d %02d:%02d:%02d" % stamp, end=' ')

        print(member.name + ("/" if member.isdir() else ""), end=' ')

        if verbose:
            if member.issym():
                print("->", member.linkname, end=' ')
            if member.islnk():
                print("link to", member.linkname, end=' ')
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002029
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
    (directory, fifo, symbolic link, etc.). If given, `arcname'
    specifies an alternative name for the file in the archive.
    Directories are added recursively by default. This can be avoided by
    setting `recursive' to False. `exclude' is a function that should
    return True for each filename to be excluded. `filter' is a function
    that expects a TarInfo object argument and returns the changed
    TarInfo object, if it returns None the TarInfo object will be
    excluded from the archive.

    Passing `exclude' emits a DeprecationWarning. The archive file
    itself and files of unsupported types are skipped silently apart
    from a debug message.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)

    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        # The with block closes the source file even if addfile() raises.
        with bltn_open(name, "rb") as f:
            self.addfile(tarinfo, f)

    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            for f in os.listdir(name):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                        recursive, exclude, filter=filter)

    else:
        self.addfile(tarinfo)
2091
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
    given, tarinfo.size bytes are read from it and added to the archive.
    You can create TarInfo objects using gettarinfo().
    On Windows platforms, `fileobj' should always be opened with mode
    'rb' to avoid irritation about the file size.
    """
    self._check("aw")

    # Work on a shallow copy; that copy is what ends up in self.members.
    tarinfo = copy.copy(tarinfo)

    header = tarinfo.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset += len(header)

    # If there's data to follow, append it.
    if fileobj is not None:
        copyfileobj(fileobj, self.fileobj, tarinfo.size)
        full_blocks, tail = divmod(tarinfo.size, BLOCKSIZE)
        if tail > 0:
            # Pad the last, partially filled block with NULs.
            self.fileobj.write(NUL * (BLOCKSIZE - tail))
            full_blocks += 1
        self.offset += full_blocks * BLOCKSIZE

    self.members.append(tarinfo)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002117
def extractall(self, path=".", members=None):
    """Extract all members from the archive to the current working
    directory and set owner, modification time and permissions on
    directories afterwards. `path' specifies a different directory
    to extract to. `members' is optional and must be a subset of the
    list returned by getmembers().
    """
    pending_dirs = []

    if members is None:
        members = self

    for member in members:
        if member.isdir():
            # Extract directories with a safe mode; the original object
            # is remembered so its real attributes can be applied later.
            pending_dirs.append(member)
            member = copy.copy(member)
            member.mode = 0o700
        # Do not set_attrs directories, as we will do that further down
        self.extract(member, path, set_attrs=not member.isdir())

    # Walk the directories in reverse name order.
    by_name = sorted(pending_dirs, key=lambda entry: entry.name)

    # Set correct owner, mtime and filemode on directories.
    for member in reversed(by_name):
        dirpath = os.path.join(path, member.name)
        try:
            self.chown(member, dirpath)
            self.utime(member, dirpath)
            self.chmod(member, dirpath)
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % e)
2155
def extract(self, member, path="", set_attrs=True):
    """Extract a single member below `path' (the current working
       directory by default), using the member's full name. `member'
       may be a filename or a TarInfo object. File attributes (owner,
       mtime, mode) are set unless `set_attrs' is False.

       An EnvironmentError is re-raised when errorlevel > 0 and an
       ExtractError when errorlevel > 1; otherwise the error is only
       reported through _dbg().
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    if tarinfo.islnk():
        # makelink() reads this attribute to locate the hard link's
        # target below `path'.
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    destination = os.path.join(path, tarinfo.name)
    try:
        self._extract_member(tarinfo, destination, set_attrs=set_attrs)
    except EnvironmentError as e:
        if self.errorlevel > 0:
            raise
        if e.filename is None:
            message = "tarfile: %s" % e.strerror
        else:
            message = "tarfile: %s %r" % (e.strerror, e.filename)
        self._dbg(1, message)
    except ExtractError as e:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % e)
2190
def extractfile(self, member):
    """Return a file-like object for a member of the archive. `member'
       may be a filename or a TarInfo object. For a regular file (or a
       member of unknown type) the object wraps the member itself; for
       a link it is built from the link's target. For any other kind
       of member None is returned.
       The file-like object is read-only and provides the following
       methods: read(), readline(), readlines(), seek() and tell()
    """
    self._check("r")

    if isinstance(member, str):
        tarinfo = self.getmember(member)
    else:
        tarinfo = member

    # Members of an unknown type are treated like regular files.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    if not (tarinfo.islnk() or tarinfo.issym()):
        # No data is associated with the member (directory, chrdev,
        # blkdev, etc.), so there is nothing to wrap.
        return None

    if isinstance(self.fileobj, _Stream):
        # A (sym)link cannot be followed in a non-seekable stream of
        # tar blocks.
        raise StreamError("cannot extract (sym)link as file object")

    # A (sym)link's file object is its target's file object.
    return self.extractfile(self._find_link_target(tarinfo))
2228
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002229 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002230 """Extract the TarInfo object tarinfo to a physical
2231 file called targetpath.
2232 """
2233 # Fetch the TarInfo object for the given name
2234 # and build the destination pathname, replacing
2235 # forward slashes to platform specific separators.
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +00002236 targetpath = targetpath.rstrip("/")
2237 targetpath = targetpath.replace("/", os.sep)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002238
2239 # Create all upper directories.
2240 upperdirs = os.path.dirname(targetpath)
2241 if upperdirs and not os.path.exists(upperdirs):
Christian Heimes2202f872008-02-06 14:31:34 +00002242 # Create directories that are not part of the archive with
2243 # default permissions.
Thomas Woutersb2137042007-02-01 18:02:27 +00002244 os.makedirs(upperdirs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002245
2246 if tarinfo.islnk() or tarinfo.issym():
2247 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2248 else:
2249 self._dbg(1, tarinfo.name)
2250
2251 if tarinfo.isreg():
2252 self.makefile(tarinfo, targetpath)
2253 elif tarinfo.isdir():
2254 self.makedir(tarinfo, targetpath)
2255 elif tarinfo.isfifo():
2256 self.makefifo(tarinfo, targetpath)
2257 elif tarinfo.ischr() or tarinfo.isblk():
2258 self.makedev(tarinfo, targetpath)
2259 elif tarinfo.islnk() or tarinfo.issym():
2260 self.makelink(tarinfo, targetpath)
2261 elif tarinfo.type not in SUPPORTED_TYPES:
2262 self.makeunknown(tarinfo, targetpath)
2263 else:
2264 self.makefile(tarinfo, targetpath)
2265
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002266 if set_attrs:
2267 self.chown(tarinfo, targetpath)
2268 if not tarinfo.issym():
2269 self.chmod(tarinfo, targetpath)
2270 self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002271
2272 #--------------------------------------------------------------------------
2273 # Below are the different file methods. They are called via
2274 # _extract_member() when extract() is called. They can be replaced in a
2275 # subclass to implement other functionality.
2276
def makedir(self, tarinfo, targetpath):
    """Create the directory targetpath with mode 0o700. A directory
       that already exists is not treated as an error.
    """
    try:
        # Only a safe mode is used here; the member's real mode is
        # applied later by the extraction code.
        os.mkdir(targetpath, 0o700)
    except EnvironmentError as e:
        if e.errno == errno.EEXIST:
            return
        raise
2287
def makefile(self, tarinfo, targetpath):
    """Write the data of tarinfo to a new file called targetpath.

       The archive's file object is positioned at tarinfo.offset_data
       and the data is copied from there. If tarinfo.sparse is set,
       it is iterated as (offset, size) pairs and each chunk is written
       at its offset. The result is always cut to exactly tarinfo.size
       bytes.
    """
    source = self.fileobj
    source.seek(tarinfo.offset_data)
    # The with block guarantees that the output file is closed even if
    # seeking, copying or truncating raises.
    with bltn_open(targetpath, "wb") as target:
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
        else:
            copyfileobj(source, target, tarinfo.size)
        # Make the file exactly tarinfo.size bytes long.
        target.seek(tarinfo.size)
        target.truncate()
2303
def makeunknown(self, tarinfo, targetpath):
    """Extract a member of unknown type to targetpath as if it were a
       regular file and report that through _dbg().
    """
    self.makefile(tarinfo, targetpath)
    notice = "tarfile: Unknown file type %r, extracted as regular file."
    self._dbg(1, notice % tarinfo.type)
2311
def makefifo(self, tarinfo, targetpath):
    """Create a fifo called targetpath. Raise ExtractError when the
       os module does not provide mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002319
def makedev(self, tarinfo, targetpath):
    """Create a character or block device called targetpath. Raise
       ExtractError when the os module lacks mknod() or makedev().
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the member's mode bits with the matching device type bit.
    mode = tarinfo.mode
    mode |= stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR

    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, mode, device)
2334
def makelink(self, tarinfo, targetpath):
    """Make a (symbolic) link called targetpath. If it cannot be created
       (platform limitation), a copy of the referenced archive member is
       extracted to targetpath instead of a link.

       Raise ExtractError if that fallback is needed but the link's
       target cannot be found inside the archive.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        elif os.path.exists(tarinfo._link_target):
            # _link_target is prepared by extract().
            os.link(tarinfo._link_target, targetpath)
        else:
            # The hard link's target is not on disk, so extract the
            # referenced member in its place.
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
    except symlink_exception:
        # NOTE(review): symlink_exception is defined elsewhere in this
        # module; presumably it covers the errors raised where links
        # are unsupported.
        # The copy fallback belongs here and only here: running it
        # after a link was created successfully would write through
        # the new link or fail although nothing went wrong.
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002363
def chown(self, tarinfo, targetpath):
    """Set owner of targetpath according to tarinfo. This is only
       attempted when the effective user id is 0. Raise ExtractError
       if changing the owner fails.
    """
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        # We have to be root to change ownership.
        return

    # Prefer the ids that the stored names resolve to on this system
    # and fall back to the numeric ids from the archive.
    try:
        gid = grp.getgrnam(tarinfo.gname)[2]
    except KeyError:
        gid = tarinfo.gid
    try:
        uid = pwd.getpwnam(tarinfo.uname)[2]
    except KeyError:
        uid = tarinfo.uid

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, uid, gid)
        elif sys.platform != "os2emx":
            os.chown(targetpath, uid, gid)
    except EnvironmentError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002385
def chmod(self, tarinfo, targetpath):
    """Set file permissions of targetpath to tarinfo.mode. Nothing is
       done when the os module has no chmod(). Raise ExtractError if
       the mode cannot be changed.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except EnvironmentError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002394
def utime(self, tarinfo, targetpath):
    """Set access and modification time of targetpath to tarinfo.mtime.
       Nothing is done when the os module has no utime(). Raise
       ExtractError if the time cannot be changed.
    """
    if hasattr(os, 'utime'):
        stamp = tarinfo.mtime
        try:
            os.utime(targetpath, (stamp, stamp))
        except EnvironmentError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002404
2405 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, when
       TarFile is opened for reading. Return None if there is no more
       available.
    """
    self._check("ra")

    # A member that was stored in firstmember is handed out exactly
    # once before anything else is read.
    if self.firstmember is not None:
        first, self.firstmember = self.firstmember, None
        return first

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as e:
            if self.ignore_zeros:
                # Skip the block and keep looking for a header.
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as e:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
            elif self.offset == 0:
                raise ReadError(str(e))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as e:
            if self.offset == 0:
                raise ReadError(str(e))
        except SubsequentHeaderError as e:
            raise ReadError(str(e))
        # Either a member was read or an error ended the archive.
        break

    if tarinfo is None:
        # Nothing more could be read: the member list is complete.
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002451
2452 #--------------------------------------------------------------------------
2453 # Little helper methods:
2454
Lars Gustäbel1b512722010-06-03 12:45:16 +00002455 def _getmember(self, name, tarinfo=None, normalize=False):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002456 """Find an archive member by name from bottom to top.
2457 If tarinfo is given, it is used as the starting point.
2458 """
Martin v. Löwisf3c56112004-09-18 09:08:52 +00002459 # Ensure that all members have been loaded.
2460 members = self.getmembers()
2461
Lars Gustäbel1b512722010-06-03 12:45:16 +00002462 # Limit the member search list up to tarinfo.
2463 if tarinfo is not None:
2464 members = members[:members.index(tarinfo)]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002465
Lars Gustäbel1b512722010-06-03 12:45:16 +00002466 if normalize:
2467 name = os.path.normpath(name)
2468
2469 for member in reversed(members):
2470 if normalize:
2471 member_name = os.path.normpath(member.name)
2472 else:
2473 member_name = member.name
2474
2475 if name == member_name:
2476 return member
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002477
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002478 def _load(self):
2479 """Read through the entire archive file and look for readable
2480 members.
2481 """
2482 while True:
2483 tarinfo = self.next()
2484 if tarinfo is None:
2485 break
2486 self._loaded = True
2487
2488 def _check(self, mode=None):
2489 """Check if TarFile is still open, and if the operation's mode
2490 corresponds to TarFile's mode.
2491 """
2492 if self.closed:
Thomas Wouters477c8d52006-05-27 19:21:47 +00002493 raise IOError("%s is closed" % self.__class__.__name__)
Guido van Rossumd8faa362007-04-27 19:54:29 +00002494 if mode is not None and self.mode not in mode:
2495 raise IOError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002496
Lars Gustäbel1b512722010-06-03 12:45:16 +00002497 def _find_link_target(self, tarinfo):
2498 """Find the target member of a symlink or hardlink member in the
2499 archive.
2500 """
2501 if tarinfo.issym():
2502 # Always search the entire archive.
2503 linkname = os.path.dirname(tarinfo.name) + "/" + tarinfo.linkname
2504 limit = None
2505 else:
2506 # Search the archive before the link, because a hard link is
2507 # just a reference to an already archived file.
2508 linkname = tarinfo.linkname
2509 limit = tarinfo
2510
2511 member = self._getmember(linkname, tarinfo=limit, normalize=True)
2512 if member is None:
2513 raise KeyError("linkname %r not found" % linkname)
2514 return member
2515
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002516 def __iter__(self):
2517 """Provide an iterator object.
2518 """
2519 if self._loaded:
2520 return iter(self.members)
2521 else:
2522 return TarIter(self)
2523
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002524 def _dbg(self, level, msg):
2525 """Write debugging output to sys.stderr.
2526 """
2527 if level <= self.debug:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00002528 print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002529
2530 def __enter__(self):
2531 self._check()
2532 return self
2533
2534 def __exit__(self, type, value, traceback):
2535 if type is None:
2536 self.close()
2537 else:
2538 # An exception occurred. We must not call close() because
2539 # it would try to write end-of-archive blocks and padding.
2540 if not self._extfileobj:
2541 self.fileobj.close()
2542 self.closed = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002543# class TarFile
2544
class TarIter:
    """Iterator Class.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object that starts at the first member
           of `tarfile'.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return iterator object.
        """
        return self

    def __next__(self):
        """Return the next member. As long as the TarFile is not fully
           loaded, members come from its next() method; once they have
           all been read, the TarFile is marked as _loaded. After that
           members are taken from its member list by position.
        """
        archive = self.tarfile
        if archive._loaded:
            # Fix for SF #1100429: Under rare circumstances it can
            # happen that getmembers() is called during iteration,
            # which will cause TarIter to stop prematurely. Indexing
            # the member list lets the iteration continue.
            try:
                tarinfo = archive.members[self.index]
            except IndexError:
                raise StopIteration
        else:
            tarinfo = archive.next()
            if not tarinfo:
                archive._loaded = True
                raise StopIteration
        self.index += 1
        return tarinfo
2580
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002581#--------------------
2582# exported functions
2583#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.
    """
    # `open' is this module's open() (TarFile.open), not the builtin.
    try:
        open(name).close()
    except TarError:
        return False
    return True
2594
# Keep the builtin open() reachable under another name, because the
# module-level name `open' is rebound on the next line.
bltn_open = open
# Export TarFile.open as this module's open().
open = TarFile.open