blob: 54d0e0e173f444c51997a8d7579bf1e692f2de4a [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
32__version__ = "$Revision$"
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000033
Guido van Rossumd8faa362007-04-27 19:54:29 +000034version = "0.9.0"
Guido van Rossum98297ee2007-11-06 21:34:58 +000035__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
Senthil Kumaran7c9719c2011-07-28 22:32:49 +080036__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
37__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
Guido van Rossum98297ee2007-11-06 21:34:58 +000038__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000039
40#---------
41# Imports
42#---------
43import sys
44import os
45import shutil
46import stat
47import errno
48import time
49import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000050import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000051import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000052
53try:
54 import grp, pwd
55except ImportError:
56 grp = pwd = None
57
Brian Curtin16633fa2010-07-09 13:54:27 +000058# os.symlink on Windows prior to 6.0 raises NotImplementedError
59symlink_exception = (AttributeError, NotImplementedError)
60try:
61 # WindowsError (1314) will be raised if the caller does not hold the
62 # SeCreateSymbolicLinkPrivilege privilege
63 symlink_exception += (WindowsError,)
64except NameError:
65 pass
66
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000067# from tarfile import *
68__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
69
Georg Brandl1a3284e2007-12-02 09:40:06 +000070from builtins import open as _open # Since 'open' is TarFile.open
Guido van Rossum8f78fe92006-08-24 04:03:53 +000071
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000072#---------------------------------------------------------
73# tar constants
74#---------------------------------------------------------
Lars Gustäbelb506dc32007-08-07 18:36:16 +000075NUL = b"\0" # the null character
Guido van Rossumd8faa362007-04-27 19:54:29 +000076BLOCKSIZE = 512 # length of processing blocks
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000077RECORDSIZE = BLOCKSIZE * 20 # length of records
Lars Gustäbelb506dc32007-08-07 18:36:16 +000078GNU_MAGIC = b"ustar \0" # magic gnu tar string
79POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000080
Guido van Rossumd8faa362007-04-27 19:54:29 +000081LENGTH_NAME = 100 # maximum length of a filename
82LENGTH_LINK = 100 # maximum length of a linkname
83LENGTH_PREFIX = 155 # maximum length of the prefix field
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000084
Lars Gustäbelb506dc32007-08-07 18:36:16 +000085REGTYPE = b"0" # regular file
86AREGTYPE = b"\0" # regular file
87LNKTYPE = b"1" # link (inside tarfile)
88SYMTYPE = b"2" # symbolic link
89CHRTYPE = b"3" # character special device
90BLKTYPE = b"4" # block special device
91DIRTYPE = b"5" # directory
92FIFOTYPE = b"6" # fifo special device
93CONTTYPE = b"7" # contiguous file
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000094
Lars Gustäbelb506dc32007-08-07 18:36:16 +000095GNUTYPE_LONGNAME = b"L" # GNU tar longname
96GNUTYPE_LONGLINK = b"K" # GNU tar longlink
97GNUTYPE_SPARSE = b"S" # GNU tar sparse file
Guido van Rossumd8faa362007-04-27 19:54:29 +000098
Lars Gustäbelb506dc32007-08-07 18:36:16 +000099XHDTYPE = b"x" # POSIX.1-2001 extended header
100XGLTYPE = b"g" # POSIX.1-2001 global header
101SOLARIS_XHDTYPE = b"X" # Solaris extended header
Guido van Rossumd8faa362007-04-27 19:54:29 +0000102
103USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
104GNU_FORMAT = 1 # GNU tar format
105PAX_FORMAT = 2 # POSIX.1-2001 (pax) format
106DEFAULT_FORMAT = GNU_FORMAT
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000107
108#---------------------------------------------------------
109# tarfile constants
110#---------------------------------------------------------
Guido van Rossumd8faa362007-04-27 19:54:29 +0000111# File types that tarfile supports:
112SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
113 SYMTYPE, DIRTYPE, FIFOTYPE,
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000114 CONTTYPE, CHRTYPE, BLKTYPE,
115 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
116 GNUTYPE_SPARSE)
117
Guido van Rossumd8faa362007-04-27 19:54:29 +0000118# File types that will be treated as a regular file.
119REGULAR_TYPES = (REGTYPE, AREGTYPE,
120 CONTTYPE, GNUTYPE_SPARSE)
121
122# File types that are part of the GNU tar format.
123GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
124 GNUTYPE_SPARSE)
125
126# Fields from a pax header that override a TarInfo attribute.
127PAX_FIELDS = ("path", "linkpath", "size", "mtime",
128 "uid", "gid", "uname", "gname")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000129
Lars Gustäbel1465cc22010-05-17 18:02:50 +0000130# Fields from a pax header that are affected by hdrcharset.
131PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
132
Guido van Rossume7ba4952007-06-06 23:52:48 +0000133# Fields in a pax header that are numbers, all other fields
134# are treated as strings.
135PAX_NUMBER_FIELDS = {
136 "atime": float,
137 "ctime": float,
138 "mtime": float,
139 "uid": int,
140 "gid": int,
141 "size": int
142}
143
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000144#---------------------------------------------------------
145# Bits used in the mode field, values in octal.
146#---------------------------------------------------------
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000147S_IFLNK = 0o120000 # symbolic link
148S_IFREG = 0o100000 # regular file
149S_IFBLK = 0o060000 # block device
150S_IFDIR = 0o040000 # directory
151S_IFCHR = 0o020000 # character device
152S_IFIFO = 0o010000 # fifo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000153
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000154TSUID = 0o4000 # set UID on execution
155TSGID = 0o2000 # set GID on execution
156TSVTX = 0o1000 # reserved
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000157
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000158TUREAD = 0o400 # read by owner
159TUWRITE = 0o200 # write by owner
160TUEXEC = 0o100 # execute/search by owner
161TGREAD = 0o040 # read by group
162TGWRITE = 0o020 # write by group
163TGEXEC = 0o010 # execute/search by group
164TOREAD = 0o004 # read by other
165TOWRITE = 0o002 # write by other
166TOEXEC = 0o001 # execute/search by other
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000167
168#---------------------------------------------------------
Guido van Rossumd8faa362007-04-27 19:54:29 +0000169# initialization
170#---------------------------------------------------------
Victor Stinner0f35e2c2010-06-11 23:46:47 +0000171if os.name in ("nt", "ce"):
172 ENCODING = "utf-8"
173else:
174 ENCODING = sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000175
176#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000177# Some useful functions
178#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000179
def stn(s, length, encoding, errors):
    """Encode a string into a fixed-width, NUL-padded bytes field.

    The string is encoded with `encoding` using the `errors` handler, cut
    down to at most `length` bytes and then padded on the right with NUL
    bytes until it is `length` bytes long.  A value that fills the field
    completely is returned without any trailing NUL.
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, NUL)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000185
def nts(s, encoding, errors):
    """Decode a NUL-terminated bytes field into a string.

    Everything from the first NUL byte onwards is discarded; when the field
    contains no NUL byte it is decoded in full.  Decoding uses `encoding`
    with the `errors` handler.
    """
    head, _, _ = s.partition(b"\0")
    return head.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000193
def nti(s):
    """Convert a tar header number field to a Python integer.

    Two encodings are understood (see itn() for the writer side):

    * a NUL-terminated string of octal digits (an empty string means 0);
    * a leading 0o200 (positive) or 0o377 (negative) marker byte followed
      by a big-endian base-256 number.

    Raises InvalidHeaderError if an octal field cannot be parsed.
    """
    marker = s[0]
    if marker not in (0o200, 0o377):
        # Plain encoding: octal digits up to the first NUL byte.
        try:
            return int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")

    # Base-256 encoding: the bytes after the marker are the magnitude.
    value = int.from_bytes(s[1:], "big")
    if marker == 0o377:
        # Negative numbers are stored offset by 256 ** (number of payload bytes).
        value -= 256 ** (len(s) - 1)
    return value
212
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a Python number to a tar header number field.

    n      -- the number to store; non-integers (e.g. float timestamps)
              are truncated with int() first.
    digits -- total width of the field in bytes.
    format -- archive format; only GNU_FORMAT permits the base-256
              fallback for values that do not fit the octal form.

    Returns a bytes object for the octal form and a bytearray for the
    base-256 form.  Raises ValueError if the number does not fit.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicates this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.

    # "%o" rejects floats on Python >= 3.5 and the base-256 path needs
    # integer bit operations, so normalise the value up front.
    n = int(n)
    width = digits - 1

    if 0 <= n < 8 ** width:
        s = bytes("%0*o" % (width, n), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** width <= n < 256 ** width:
        marker = 0o200 if n >= 0 else 0o377
        # Masking yields the low `width` bytes; for negative numbers this
        # is the same offset representation that nti() decodes.
        payload = (n & (256 ** width - 1)).to_bytes(width, "big")
        s = bytearray([marker]) + payload
    else:
        raise ValueError("overflow in number field")

    return s
240
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a member's header block.

    The checksum is the sum of the first 512 bytes of `buf`, with the
    8-byte chksum field (offsets 148-155) counted as if it were filled
    with spaces (8 * 0x20 == 256).  According to the GNU tar sources, some
    tars (Sun and NeXT) sum the bytes as signed chars, which gives a
    different result when bytes with the high bit set are present, so
    both variants are calculated.
    """
    # "8x" skips the chksum field itself; its space-filled value is the
    # constant 256 added below.
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000253
def copyfileobj(src, dst, length=None):
    """Copy `length` bytes from file object `src` to file object `dst`.

    If `length` is None the entire remaining content of `src` is copied.
    Otherwise the data is moved in chunks of at most 16 KiB and IOError is
    raised as soon as `src` delivers fewer bytes than requested.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    chunk_size = 16 * 1024

    def transfer(count):
        # Move exactly `count` bytes; a short read means src ended early.
        data = src.read(count)
        if len(data) < count:
            raise IOError("end of file reached")
        dst.write(data)

    full_chunks, tail = divmod(length, chunk_size)
    for _ in range(full_chunks):
        transfer(chunk_size)
    if tail != 0:
        transfer(tail)
    return
278
# One entry per character of the "-rwxrwxrwx" string.  Each entry lists
# (bits, character) candidates in priority order; the first candidate
# whose bits are all set in the mode supplies the character.
filemode_table = (
    # file type
    ((S_IFLNK, "l"),
     (S_IFREG, "-"),
     (S_IFBLK, "b"),
     (S_IFDIR, "d"),
     (S_IFCHR, "c"),
     (S_IFIFO, "p")),

    # owner
    ((TUREAD, "r"),),
    ((TUWRITE, "w"),),
    ((TUEXEC|TSUID, "s"),
     (TSUID, "S"),
     (TUEXEC, "x")),

    # group
    ((TGREAD, "r"),),
    ((TGWRITE, "w"),),
    ((TGEXEC|TSGID, "s"),
     (TSGID, "S"),
     (TGEXEC, "x")),

    # other
    ((TOREAD, "r"),),
    ((TOWRITE, "w"),),
    ((TOEXEC|TSVTX, "t"),
     (TSVTX, "T"),
     (TOEXEC, "x"))
)

def filemode(mode):
    """Convert a file's mode to a string of the form
       -rwxrwxrwx.
       Used by TarFile.list()
    """
    chars = []
    for candidates in filemode_table:
        # First candidate with all of its bits set wins; "-" if none match.
        matched = next(
            (char for bit, char in candidates if mode & bit == bit), "-")
        chars.append(matched)
    return "".join(chars)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000320
class TarError(Exception):
    """Base class for all exceptions defined by this module."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000354
355#---------------------------
356# internal stream interface
357#---------------------------
358class _LowLevelFile:
359 """Low-level file object. Supports reading and writing.
360 It is used instead of a regular file object for streaming
361 access.
362 """
363
364 def __init__(self, name, mode):
365 mode = {
366 "r": os.O_RDONLY,
367 "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
368 }[mode]
369 if hasattr(os, "O_BINARY"):
370 mode |= os.O_BINARY
Lars Gustäbeld6eb70b2010-04-29 15:37:02 +0000371 self.fd = os.open(name, mode, 0o666)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000372
373 def close(self):
374 os.close(self.fd)
375
376 def read(self, size):
377 return os.read(self.fd, size)
378
379 def write(self, s):
380 os.write(self.fd, s)
381
382class _Stream:
383 """Class that serves as an adapter between TarFile and
384 a stream-like object. The stream-like object only
385 needs to have a read() or write() method and is accessed
386 blockwise. Use of gzip or bzip2 compression is possible.
387 A stream-like object could be for example: sys.stdin,
388 sys.stdout, a socket, a tape device etc.
389
390 _Stream is intended to be used only internally.
391 """
392
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000393 def __init__(self, name, mode, comptype, fileobj, bufsize):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000394 """Construct a _Stream object.
395 """
396 self._extfileobj = True
397 if fileobj is None:
398 fileobj = _LowLevelFile(name, mode)
399 self._extfileobj = False
400
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000401 if comptype == '*':
402 # Enable transparent compression detection for the
403 # stream interface
404 fileobj = _StreamProxy(fileobj)
405 comptype = fileobj.getcomptype()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000406
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000407 self.name = name or ""
408 self.mode = mode
409 self.comptype = comptype
410 self.fileobj = fileobj
411 self.bufsize = bufsize
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000412 self.buf = b""
Guido van Rossume2a383d2007-01-15 16:59:06 +0000413 self.pos = 0
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000414 self.closed = False
415
Antoine Pitrou605c2932010-09-23 20:15:14 +0000416 try:
417 if comptype == "gz":
418 try:
419 import zlib
420 except ImportError:
421 raise CompressionError("zlib module is not available")
422 self.zlib = zlib
423 self.crc = zlib.crc32(b"")
424 if mode == "r":
425 self._init_read_gz()
426 else:
427 self._init_write_gz()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000428
Antoine Pitrou605c2932010-09-23 20:15:14 +0000429 if comptype == "bz2":
430 try:
431 import bz2
432 except ImportError:
433 raise CompressionError("bz2 module is not available")
434 if mode == "r":
435 self.dbuf = b""
436 self.cmp = bz2.BZ2Decompressor()
437 else:
438 self.cmp = bz2.BZ2Compressor()
439 except:
440 if not self._extfileobj:
441 self.fileobj.close()
442 self.closed = True
443 raise
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000444
445 def __del__(self):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000446 if hasattr(self, "closed") and not self.closed:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000447 self.close()
448
449 def _init_write_gz(self):
450 """Initialize for writing with gzip compression.
451 """
452 self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
453 -self.zlib.MAX_WBITS,
454 self.zlib.DEF_MEM_LEVEL,
455 0)
Guido van Rossume2a383d2007-01-15 16:59:06 +0000456 timestamp = struct.pack("<L", int(time.time()))
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000457 self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000458 if self.name.endswith(".gz"):
459 self.name = self.name[:-3]
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000460 # RFC1952 says we must use ISO-8859-1 for the FNAME field.
461 self.__write(self.name.encode("iso-8859-1", "replace") + NUL)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000462
463 def write(self, s):
464 """Write string s to the stream.
465 """
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000466 if self.comptype == "gz":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000467 self.crc = self.zlib.crc32(s, self.crc)
468 self.pos += len(s)
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000469 if self.comptype != "tar":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000470 s = self.cmp.compress(s)
471 self.__write(s)
472
473 def __write(self, s):
474 """Write string s to the stream if a whole new block
475 is ready to be written.
476 """
477 self.buf += s
478 while len(self.buf) > self.bufsize:
479 self.fileobj.write(self.buf[:self.bufsize])
480 self.buf = self.buf[self.bufsize:]
481
482 def close(self):
483 """Close the _Stream object. No operation should be
484 done on it afterwards.
485 """
486 if self.closed:
487 return
488
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000489 if self.mode == "w" and self.comptype != "tar":
Martin v. Löwisc234a522004-08-22 21:28:33 +0000490 self.buf += self.cmp.flush()
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000491
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000492 if self.mode == "w" and self.buf:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000493 self.fileobj.write(self.buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000494 self.buf = b""
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000495 if self.comptype == "gz":
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000496 # The native zlib crc is an unsigned 32-bit integer, but
497 # the Python wrapper implicitly casts that to a signed C
498 # long. So, on a 32-bit box self.crc may "look negative",
499 # while the same crc on a 64-bit box may "look positive".
500 # To avoid irksome warnings from the `struct` module, force
501 # it to look positive on all boxes.
Guido van Rossume2a383d2007-01-15 16:59:06 +0000502 self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
503 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000504
505 if not self._extfileobj:
506 self.fileobj.close()
507
508 self.closed = True
509
510 def _init_read_gz(self):
511 """Initialize for reading a gzip compressed fileobj.
512 """
513 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000514 self.dbuf = b""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000515
516 # taken from gzip.GzipFile with some alterations
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000517 if self.__read(2) != b"\037\213":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000518 raise ReadError("not a gzip file")
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000519 if self.__read(1) != b"\010":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000520 raise CompressionError("unsupported compression method")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000521
522 flag = ord(self.__read(1))
523 self.__read(6)
524
525 if flag & 4:
526 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
527 self.read(xlen)
528 if flag & 8:
529 while True:
530 s = self.__read(1)
531 if not s or s == NUL:
532 break
533 if flag & 16:
534 while True:
535 s = self.__read(1)
536 if not s or s == NUL:
537 break
538 if flag & 2:
539 self.__read(2)
540
541 def tell(self):
542 """Return the stream's file pointer position.
543 """
544 return self.pos
545
546 def seek(self, pos=0):
547 """Set the stream's file pointer to pos. Negative seeking
548 is forbidden.
549 """
550 if pos - self.pos >= 0:
551 blocks, remainder = divmod(pos - self.pos, self.bufsize)
Guido van Rossum805365e2007-05-07 22:24:25 +0000552 for i in range(blocks):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000553 self.read(self.bufsize)
554 self.read(remainder)
555 else:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556 raise StreamError("seeking backwards is not allowed")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000557 return self.pos
558
559 def read(self, size=None):
560 """Return the next size number of bytes from the stream.
561 If size is not defined, return all bytes of the stream
562 up to EOF.
563 """
564 if size is None:
565 t = []
566 while True:
567 buf = self._read(self.bufsize)
568 if not buf:
569 break
570 t.append(buf)
571 buf = "".join(t)
572 else:
573 buf = self._read(size)
574 self.pos += len(buf)
575 return buf
576
577 def _read(self, size):
578 """Return size bytes from the stream.
579 """
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000580 if self.comptype == "tar":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000581 return self.__read(size)
582
583 c = len(self.dbuf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000584 while c < size:
585 buf = self.__read(self.bufsize)
586 if not buf:
587 break
Guido van Rossumd8faa362007-04-27 19:54:29 +0000588 try:
589 buf = self.cmp.decompress(buf)
590 except IOError:
591 raise ReadError("invalid compressed data")
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000592 self.dbuf += buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000593 c += len(buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000594 buf = self.dbuf[:size]
595 self.dbuf = self.dbuf[size:]
596 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000597
598 def __read(self, size):
599 """Return size bytes from stream. If internal buffer is empty,
600 read another block from the stream.
601 """
602 c = len(self.buf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000603 while c < size:
604 buf = self.fileobj.read(self.bufsize)
605 if not buf:
606 break
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000607 self.buf += buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000608 c += len(buf)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000609 buf = self.buf[:size]
610 self.buf = self.buf[size:]
611 return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000612# class _Stream
613
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000614class _StreamProxy(object):
615 """Small proxy class that enables transparent compression
616 detection for the Stream interface (mode 'r|*').
617 """
618
619 def __init__(self, fileobj):
620 self.fileobj = fileobj
621 self.buf = self.fileobj.read(BLOCKSIZE)
622
623 def read(self, size):
624 self.read = self.fileobj.read
625 return self.buf
626
627 def getcomptype(self):
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000628 if self.buf.startswith(b"\037\213\010"):
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000629 return "gz"
Lars Gustäbeled1ac582011-12-06 12:56:38 +0100630 if self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
Martin v. Löwis78be7df2005-03-05 12:47:42 +0000631 return "bz2"
632 return "tar"
633
634 def close(self):
635 self.fileobj.close()
636# class StreamProxy
637
Thomas Wouters477c8d52006-05-27 19:21:47 +0000638class _BZ2Proxy(object):
639 """Small proxy class that enables external file object
640 support for "r:bz2" and "w:bz2" modes. This is actually
641 a workaround for a limitation in bz2 module's BZ2File
642 class which (unlike gzip.GzipFile) has no support for
643 a file object argument.
644 """
645
646 blocksize = 16 * 1024
647
648 def __init__(self, fileobj, mode):
649 self.fileobj = fileobj
650 self.mode = mode
Guido van Rossumd8faa362007-04-27 19:54:29 +0000651 self.name = getattr(self.fileobj, "name", None)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000652 self.init()
653
654 def init(self):
655 import bz2
656 self.pos = 0
657 if self.mode == "r":
658 self.bz2obj = bz2.BZ2Decompressor()
659 self.fileobj.seek(0)
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000660 self.buf = b""
Thomas Wouters477c8d52006-05-27 19:21:47 +0000661 else:
662 self.bz2obj = bz2.BZ2Compressor()
663
664 def read(self, size):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000665 x = len(self.buf)
666 while x < size:
Lars Gustäbel42e00912009-03-22 20:34:29 +0000667 raw = self.fileobj.read(self.blocksize)
668 if not raw:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000669 break
Lars Gustäbel42e00912009-03-22 20:34:29 +0000670 data = self.bz2obj.decompress(raw)
671 self.buf += data
Thomas Wouters477c8d52006-05-27 19:21:47 +0000672 x += len(data)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000673
674 buf = self.buf[:size]
675 self.buf = self.buf[size:]
676 self.pos += len(buf)
677 return buf
678
679 def seek(self, pos):
680 if pos < self.pos:
681 self.init()
682 self.read(pos - self.pos)
683
684 def tell(self):
685 return self.pos
686
687 def write(self, data):
688 self.pos += len(data)
689 raw = self.bz2obj.compress(data)
690 self.fileobj.write(raw)
691
692 def close(self):
693 if self.mode == "w":
694 raw = self.bz2obj.flush()
695 self.fileobj.write(raw)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000696# class _BZ2Proxy
697
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000698#------------------------
699# Extraction file object
700#------------------------
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000701class _FileInFile(object):
702 """A thin wrapper around an existing file object that
703 provides a part of its data as an individual file
704 object.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000705 """
706
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000707 def __init__(self, fileobj, offset, size, blockinfo=None):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000708 self.fileobj = fileobj
709 self.offset = offset
710 self.size = size
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000711 self.position = 0
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000712
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000713 if blockinfo is None:
714 blockinfo = [(0, size)]
715
716 # Construct a map with data and zero blocks.
717 self.map_index = 0
718 self.map = []
719 lastpos = 0
720 realpos = self.offset
721 for offset, size in blockinfo:
722 if offset > lastpos:
723 self.map.append((False, lastpos, offset, None))
724 self.map.append((True, offset, offset + size, realpos))
725 realpos += size
726 lastpos = offset + size
727 if lastpos < self.size:
728 self.map.append((False, lastpos, self.size, None))
729
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000730 def seekable(self):
731 if not hasattr(self.fileobj, "seekable"):
732 # XXX gzip.GzipFile and bz2.BZ2File
733 return True
734 return self.fileobj.seekable()
735
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000736 def tell(self):
737 """Return the current file position.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000738 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000739 return self.position
740
741 def seek(self, position):
742 """Seek to a position in the file.
743 """
744 self.position = position
745
746 def read(self, size=None):
747 """Read data from the file.
748 """
749 if size is None:
750 size = self.size - self.position
751 else:
752 size = min(size, self.size - self.position)
753
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000754 buf = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000755 while size > 0:
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000756 while True:
757 data, start, stop, offset = self.map[self.map_index]
758 if start <= self.position < stop:
759 break
760 else:
761 self.map_index += 1
762 if self.map_index == len(self.map):
763 self.map_index = 0
764 length = min(size, stop - self.position)
765 if data:
Lars Gustäbel9f6cbe02011-02-23 11:52:31 +0000766 self.fileobj.seek(offset + (self.position - start))
767 buf += self.fileobj.read(length)
Lars Gustäbel9cbdd752010-10-29 09:08:19 +0000768 else:
769 buf += NUL * length
770 size -= length
771 self.position += length
772 return buf
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000773#class _FileInFile
774
775
class ExFileObject(object):
    """Read-only file-like object for the data of one archive member.

    Instances are returned by TarFile.extractfile().  All reads are
    delegated to a _FileInFile wrapper around the archive's file object;
    data that readline() fetched ahead of the caller is kept in
    self.buffer and is handed out first by later reads.
    """
    blocksize = 1024

    def __init__(self, tarfile, tarinfo):
        """Create a reader for the member tarinfo of the archive tarfile.
        """
        self.fileobj = _FileInFile(tarfile.fileobj,
                                   tarinfo.offset_data,
                                   tarinfo.size,
                                   tarinfo.sparse)
        self.name = tarinfo.name
        self.mode = "r"
        self.closed = False
        self.size = tarinfo.size

        self.position = 0   # logical read position inside the member
        self.buffer = b""   # bytes read ahead by readline()

    def readable(self):
        """Return True, the object can always be read from.
        """
        return True

    def writable(self):
        """Return False, the object cannot be written to.
        """
        return False

    def seekable(self):
        """Return whether the underlying file object is seekable.
        """
        return self.fileobj.seekable()

    def read(self, size=None):
        """Read at most size bytes from the file. If size is not
           present, None or negative, read all data until EOF is
           reached.

           Raise ValueError if the file is closed.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        # A negative size means "read everything", like for other
        # file objects. Without this, the slices below would be taken
        # with a negative index and silently drop buffered data.
        if size is not None and size < 0:
            size = None

        if size is None:
            buf = self.buffer + self.fileobj.read()
            self.buffer = b""
        else:
            # Serve the request from the read-ahead buffer first and
            # fetch only the missing rest from the file.
            buf = self.buffer[:size]
            self.buffer = self.buffer[size:]
            buf += self.fileobj.read(size - len(buf))

        self.position += len(buf)
        return buf

    # XXX TextIOWrapper uses the read1() method.
    read1 = read

    def readline(self, size=-1):
        """Read one entire line from the file. If size is present
           and non-negative, return a bytes object with at most that
           size, which may be an incomplete line. A size of None or
           any negative size means no limit.

           Raise ValueError if the file is closed.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        pos = self.buffer.find(b"\n") + 1
        if pos == 0:
            # No newline buffered yet: read blocks until one shows up
            # or the data is exhausted.
            while True:
                buf = self.fileobj.read(self.blocksize)
                self.buffer += buf
                if not buf or b"\n" in buf:
                    pos = self.buffer.find(b"\n") + 1
                    if pos == 0:
                        # EOF without a newline: return what is left.
                        pos = len(self.buffer)
                    break

        if size is not None and size >= 0:
            pos = min(size, pos)

        buf = self.buffer[:pos]
        self.buffer = self.buffer[pos:]
        self.position += len(buf)
        return buf

    def readlines(self):
        """Return a list with all remaining lines.
        """
        result = []
        while True:
            line = self.readline()
            if not line:
                break
            result.append(line)
        return result

    def tell(self):
        """Return the current file position.

           Raise ValueError if the file is closed.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        return self.position

    def seek(self, pos, whence=os.SEEK_SET):
        """Seek to a position in the file and return the new position.

           The resulting position is clamped to the range 0..self.size.
           Raise ValueError if the file is closed or whence is not one
           of os.SEEK_SET, os.SEEK_CUR and os.SEEK_END.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        if whence == os.SEEK_SET:
            self.position = min(max(pos, 0), self.size)
        elif whence == os.SEEK_CUR:
            if pos < 0:
                self.position = max(self.position + pos, 0)
            else:
                self.position = min(self.position + pos, self.size)
        elif whence == os.SEEK_END:
            self.position = max(min(self.size + pos, self.size), 0)
        else:
            raise ValueError("Invalid argument")

        # The read-ahead data belongs to the old position.
        self.buffer = b""
        self.fileobj.seek(self.position)
        return self.position

    def close(self):
        """Close the file object.
        """
        self.closed = True

    def __iter__(self):
        """Get an iterator over the file's lines.
        """
        while True:
            line = self.readline()
            if not line:
                break
            yield line
#class ExFileObject
913
914#------------------
915# Exported Classes
916#------------------
917class TarInfo(object):
918 """Informational class which holds the details about an
919 archive member given by a tar header block.
920 TarInfo objects are returned by TarFile.getmember(),
921 TarFile.getmembers() and TarFile.gettarinfo() and are
922 usually created internally.
923 """
924
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000925 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
926 "chksum", "type", "linkname", "uname", "gname",
927 "devmajor", "devminor",
928 "offset", "offset_data", "pax_headers", "sparse",
929 "tarfile", "_sparse_structs", "_link_target")
930
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000931 def __init__(self, name=""):
932 """Construct a TarInfo object. name is the optional name
933 of the member.
934 """
Guido van Rossumd8faa362007-04-27 19:54:29 +0000935 self.name = name # member name
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000936 self.mode = 0o644 # file permissions
Thomas Wouters477c8d52006-05-27 19:21:47 +0000937 self.uid = 0 # user id
938 self.gid = 0 # group id
939 self.size = 0 # file size
940 self.mtime = 0 # modification time
941 self.chksum = 0 # header checksum
942 self.type = REGTYPE # member type
943 self.linkname = "" # link name
Lars Gustäbel331b8002010-10-04 15:18:47 +0000944 self.uname = "" # user name
945 self.gname = "" # group name
Thomas Wouters477c8d52006-05-27 19:21:47 +0000946 self.devmajor = 0 # device major number
947 self.devminor = 0 # device minor number
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000948
Thomas Wouters477c8d52006-05-27 19:21:47 +0000949 self.offset = 0 # the tar header starts here
950 self.offset_data = 0 # the file's data starts here
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000951
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000952 self.sparse = None # sparse member information
Guido van Rossumd8faa362007-04-27 19:54:29 +0000953 self.pax_headers = {} # pax header information
954
955 # In pax headers the "name" and "linkname" field are called
956 # "path" and "linkpath".
957 def _getpath(self):
958 return self.name
959 def _setpath(self, name):
960 self.name = name
961 path = property(_getpath, _setpath)
962
963 def _getlinkpath(self):
964 return self.linkname
965 def _setlinkpath(self, linkname):
966 self.linkname = linkname
967 linkpath = property(_getlinkpath, _setlinkpath)
968
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000969 def __repr__(self):
970 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
971
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000972 def get_info(self):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000973 """Return the TarInfo's attributes as a dictionary.
974 """
975 info = {
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +0000976 "name": self.name,
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000977 "mode": self.mode & 0o7777,
Guido van Rossumd8faa362007-04-27 19:54:29 +0000978 "uid": self.uid,
979 "gid": self.gid,
980 "size": self.size,
981 "mtime": self.mtime,
982 "chksum": self.chksum,
983 "type": self.type,
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +0000984 "linkname": self.linkname,
Guido van Rossumd8faa362007-04-27 19:54:29 +0000985 "uname": self.uname,
986 "gname": self.gname,
987 "devmajor": self.devmajor,
988 "devminor": self.devminor
989 }
990
991 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
992 info["name"] += "/"
993
994 return info
995
Victor Stinnerde629d42010-05-05 21:43:57 +0000996 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
Guido van Rossumd8faa362007-04-27 19:54:29 +0000997 """Return a tar header as a string of 512 byte blocks.
998 """
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000999 info = self.get_info()
Guido van Rossume7ba4952007-06-06 23:52:48 +00001000
Guido van Rossumd8faa362007-04-27 19:54:29 +00001001 if format == USTAR_FORMAT:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001002 return self.create_ustar_header(info, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001003 elif format == GNU_FORMAT:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001004 return self.create_gnu_header(info, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001005 elif format == PAX_FORMAT:
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001006 return self.create_pax_header(info, encoding)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001007 else:
1008 raise ValueError("invalid format")
1009
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001010 def create_ustar_header(self, info, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001011 """Return the object as a ustar header block.
1012 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001013 info["magic"] = POSIX_MAGIC
1014
1015 if len(info["linkname"]) > LENGTH_LINK:
1016 raise ValueError("linkname is too long")
1017
1018 if len(info["name"]) > LENGTH_NAME:
1019 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1020
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001021 return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001022
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001023 def create_gnu_header(self, info, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001024 """Return the object as a GNU header block sequence.
1025 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001026 info["magic"] = GNU_MAGIC
1027
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001028 buf = b""
Guido van Rossumd8faa362007-04-27 19:54:29 +00001029 if len(info["linkname"]) > LENGTH_LINK:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001030 buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001031
1032 if len(info["name"]) > LENGTH_NAME:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001033 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001034
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001035 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001036
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001037 def create_pax_header(self, info, encoding):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001038 """Return the object as a ustar header block. If it cannot be
1039 represented this way, prepend a pax extended header sequence
1040 with supplement information.
1041 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001042 info["magic"] = POSIX_MAGIC
1043 pax_headers = self.pax_headers.copy()
1044
1045 # Test string fields for values that exceed the field length or cannot
1046 # be represented in ASCII encoding.
1047 for name, hname, length in (
1048 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
1049 ("uname", "uname", 32), ("gname", "gname", 32)):
1050
Guido van Rossume7ba4952007-06-06 23:52:48 +00001051 if hname in pax_headers:
1052 # The pax header has priority.
1053 continue
1054
Guido van Rossumd8faa362007-04-27 19:54:29 +00001055 # Try to encode the string as ASCII.
1056 try:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001057 info[name].encode("ascii", "strict")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001058 except UnicodeEncodeError:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001059 pax_headers[hname] = info[name]
Guido van Rossumd8faa362007-04-27 19:54:29 +00001060 continue
1061
Guido van Rossume7ba4952007-06-06 23:52:48 +00001062 if len(info[name]) > length:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001063 pax_headers[hname] = info[name]
Guido van Rossumd8faa362007-04-27 19:54:29 +00001064
1065 # Test number fields for values that exceed the field limit or values
1066 # that like to be stored as float.
1067 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
Guido van Rossume7ba4952007-06-06 23:52:48 +00001068 if name in pax_headers:
1069 # The pax header has priority. Avoid overflow.
1070 info[name] = 0
1071 continue
1072
Guido van Rossumd8faa362007-04-27 19:54:29 +00001073 val = info[name]
1074 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001075 pax_headers[name] = str(val)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001076 info[name] = 0
1077
Guido van Rossume7ba4952007-06-06 23:52:48 +00001078 # Create a pax extended header if necessary.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001079 if pax_headers:
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001080 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001081 else:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001082 buf = b""
Guido van Rossumd8faa362007-04-27 19:54:29 +00001083
Lars Gustäbel3741eff2007-08-21 12:17:05 +00001084 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001085
1086 @classmethod
Lars Gustäbel3741eff2007-08-21 12:17:05 +00001087 def create_pax_global_header(cls, pax_headers):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001088 """Return the object as a pax global header block sequence.
1089 """
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001090 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf8")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001091
1092 def _posix_split_name(self, name):
1093 """Split a name longer than 100 chars into a prefix
1094 and a name part.
1095 """
1096 prefix = name[:LENGTH_PREFIX + 1]
1097 while prefix and prefix[-1] != "/":
1098 prefix = prefix[:-1]
1099
1100 name = name[len(prefix):]
1101 prefix = prefix[:-1]
1102
1103 if not prefix or len(name) > LENGTH_NAME:
1104 raise ValueError("name is too long")
1105 return prefix, name
1106
1107 @staticmethod
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001108 def _create_header(info, format, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001109 """Return a header block. info is a dictionary with file
1110 information, format must be one of the *_FORMAT constants.
1111 """
1112 parts = [
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001113 stn(info.get("name", ""), 100, encoding, errors),
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001114 itn(info.get("mode", 0) & 0o7777, 8, format),
Guido van Rossumd8faa362007-04-27 19:54:29 +00001115 itn(info.get("uid", 0), 8, format),
1116 itn(info.get("gid", 0), 8, format),
1117 itn(info.get("size", 0), 12, format),
1118 itn(info.get("mtime", 0), 12, format),
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001119 b" ", # checksum field
Guido van Rossumd8faa362007-04-27 19:54:29 +00001120 info.get("type", REGTYPE),
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001121 stn(info.get("linkname", ""), 100, encoding, errors),
1122 info.get("magic", POSIX_MAGIC),
Lars Gustäbel331b8002010-10-04 15:18:47 +00001123 stn(info.get("uname", ""), 32, encoding, errors),
1124 stn(info.get("gname", ""), 32, encoding, errors),
Guido van Rossumd8faa362007-04-27 19:54:29 +00001125 itn(info.get("devmajor", 0), 8, format),
1126 itn(info.get("devminor", 0), 8, format),
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001127 stn(info.get("prefix", ""), 155, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001128 ]
1129
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001130 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
Guido van Rossumd8faa362007-04-27 19:54:29 +00001131 chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
Lars Gustäbela280ca752007-08-28 07:34:33 +00001132 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
Guido van Rossumd8faa362007-04-27 19:54:29 +00001133 return buf
1134
1135 @staticmethod
1136 def _create_payload(payload):
1137 """Return the string payload filled with zero bytes
1138 up to the next 512 byte border.
1139 """
1140 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1141 if remainder > 0:
1142 payload += (BLOCKSIZE - remainder) * NUL
1143 return payload
1144
1145 @classmethod
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001146 def _create_gnu_long_header(cls, name, type, encoding, errors):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001147 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1148 for name.
1149 """
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001150 name = name.encode(encoding, errors) + NUL
Guido van Rossumd8faa362007-04-27 19:54:29 +00001151
1152 info = {}
1153 info["name"] = "././@LongLink"
1154 info["type"] = type
1155 info["size"] = len(name)
1156 info["magic"] = GNU_MAGIC
1157
1158 # create extended header + name blocks.
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001159 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
Guido van Rossumd8faa362007-04-27 19:54:29 +00001160 cls._create_payload(name)
1161
1162 @classmethod
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001163 def _create_pax_generic_header(cls, pax_headers, type, encoding):
1164 """Return a POSIX.1-2008 extended or global header sequence
Guido van Rossumd8faa362007-04-27 19:54:29 +00001165 that contains a list of keyword, value pairs. The values
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001166 must be strings.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001167 """
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001168 # Check if one of the fields contains surrogate characters and thereby
1169 # forces hdrcharset=BINARY, see _proc_pax() for more information.
1170 binary = False
1171 for keyword, value in pax_headers.items():
1172 try:
1173 value.encode("utf8", "strict")
1174 except UnicodeEncodeError:
1175 binary = True
1176 break
1177
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001178 records = b""
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001179 if binary:
1180 # Put the hdrcharset field at the beginning of the header.
1181 records += b"21 hdrcharset=BINARY\n"
1182
Guido van Rossumd8faa362007-04-27 19:54:29 +00001183 for keyword, value in pax_headers.items():
1184 keyword = keyword.encode("utf8")
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001185 if binary:
1186 # Try to restore the original byte representation of `value'.
1187 # Needless to say, that the encoding must match the string.
1188 value = value.encode(encoding, "surrogateescape")
1189 else:
1190 value = value.encode("utf8")
1191
Guido van Rossumd8faa362007-04-27 19:54:29 +00001192 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n'
1193 n = p = 0
1194 while True:
1195 n = l + len(str(p))
1196 if n == p:
1197 break
1198 p = n
Lars Gustäbela280ca752007-08-28 07:34:33 +00001199 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"
Guido van Rossumd8faa362007-04-27 19:54:29 +00001200
1201 # We use a hardcoded "././@PaxHeader" name like star does
1202 # instead of the one that POSIX recommends.
1203 info = {}
1204 info["name"] = "././@PaxHeader"
1205 info["type"] = type
1206 info["size"] = len(records)
1207 info["magic"] = POSIX_MAGIC
1208
1209 # Create pax header + record blocks.
Lars Gustäbel3741eff2007-08-21 12:17:05 +00001210 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
Guido van Rossumd8faa362007-04-27 19:54:29 +00001211 cls._create_payload(records)
1212
Guido van Rossum75b64e62005-01-16 00:16:11 +00001213 @classmethod
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001214 def frombuf(cls, buf, encoding, errors):
1215 """Construct a TarInfo object from a 512 byte bytes object.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001216 """
Lars Gustäbel9520a432009-11-22 18:48:49 +00001217 if len(buf) == 0:
1218 raise EmptyHeaderError("empty header")
Thomas Wouters477c8d52006-05-27 19:21:47 +00001219 if len(buf) != BLOCKSIZE:
Lars Gustäbel9520a432009-11-22 18:48:49 +00001220 raise TruncatedHeaderError("truncated header")
Thomas Wouters477c8d52006-05-27 19:21:47 +00001221 if buf.count(NUL) == BLOCKSIZE:
Lars Gustäbel9520a432009-11-22 18:48:49 +00001222 raise EOFHeaderError("end of file header")
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001223
1224 chksum = nti(buf[148:156])
1225 if chksum not in calc_chksums(buf):
Lars Gustäbel9520a432009-11-22 18:48:49 +00001226 raise InvalidHeaderError("bad checksum")
Thomas Wouters477c8d52006-05-27 19:21:47 +00001227
Guido van Rossumd8faa362007-04-27 19:54:29 +00001228 obj = cls()
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001229 obj.name = nts(buf[0:100], encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001230 obj.mode = nti(buf[100:108])
1231 obj.uid = nti(buf[108:116])
1232 obj.gid = nti(buf[116:124])
1233 obj.size = nti(buf[124:136])
1234 obj.mtime = nti(buf[136:148])
1235 obj.chksum = chksum
1236 obj.type = buf[156:157]
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001237 obj.linkname = nts(buf[157:257], encoding, errors)
1238 obj.uname = nts(buf[265:297], encoding, errors)
1239 obj.gname = nts(buf[297:329], encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001240 obj.devmajor = nti(buf[329:337])
1241 obj.devminor = nti(buf[337:345])
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001242 prefix = nts(buf[345:500], encoding, errors)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001243
Guido van Rossumd8faa362007-04-27 19:54:29 +00001244 # Old V7 tar format represents a directory as a regular
1245 # file with a trailing slash.
1246 if obj.type == AREGTYPE and obj.name.endswith("/"):
1247 obj.type = DIRTYPE
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001248
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +00001249 # The old GNU sparse format occupies some of the unused
1250 # space in the buffer for up to 4 sparse structures.
1251 # Save the them for later processing in _proc_sparse().
1252 if obj.type == GNUTYPE_SPARSE:
1253 pos = 386
1254 structs = []
1255 for i in range(4):
1256 try:
1257 offset = nti(buf[pos:pos + 12])
1258 numbytes = nti(buf[pos + 12:pos + 24])
1259 except ValueError:
1260 break
1261 structs.append((offset, numbytes))
1262 pos += 24
1263 isextended = bool(buf[482])
1264 origsize = nti(buf[483:495])
1265 obj._sparse_structs = (structs, isextended, origsize)
1266
Guido van Rossumd8faa362007-04-27 19:54:29 +00001267 # Remove redundant slashes from directories.
1268 if obj.isdir():
1269 obj.name = obj.name.rstrip("/")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001270
Guido van Rossumd8faa362007-04-27 19:54:29 +00001271 # Reconstruct a ustar longname.
1272 if prefix and obj.type not in GNU_TYPES:
1273 obj.name = prefix + "/" + obj.name
1274 return obj
1275
1276 @classmethod
1277 def fromtarfile(cls, tarfile):
1278 """Return the next TarInfo object from TarFile object
1279 tarfile.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001280 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001281 buf = tarfile.fileobj.read(BLOCKSIZE)
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001282 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001283 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1284 return obj._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001285
Guido van Rossumd8faa362007-04-27 19:54:29 +00001286 #--------------------------------------------------------------------------
1287 # The following are methods that are called depending on the type of a
1288 # member. The entry point is _proc_member() which can be overridden in a
1289 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1290 # implement the following
1291 # operations:
1292 # 1. Set self.offset_data to the position where the data blocks begin,
1293 # if there is data that follows.
1294 # 2. Set tarfile.offset to the position where the next member's header will
1295 # begin.
1296 # 3. Return self or another valid TarInfo object.
1297 def _proc_member(self, tarfile):
1298 """Choose the right processing method depending on
1299 the type and call it.
Thomas Wouters89f507f2006-12-13 04:49:30 +00001300 """
Guido van Rossumd8faa362007-04-27 19:54:29 +00001301 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1302 return self._proc_gnulong(tarfile)
1303 elif self.type == GNUTYPE_SPARSE:
1304 return self._proc_sparse(tarfile)
1305 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1306 return self._proc_pax(tarfile)
1307 else:
1308 return self._proc_builtin(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001309
Guido van Rossumd8faa362007-04-27 19:54:29 +00001310 def _proc_builtin(self, tarfile):
1311 """Process a builtin type or an unknown type which
1312 will be treated as a regular file.
1313 """
1314 self.offset_data = tarfile.fileobj.tell()
1315 offset = self.offset_data
1316 if self.isreg() or self.type not in SUPPORTED_TYPES:
1317 # Skip the following data blocks.
1318 offset += self._block(self.size)
1319 tarfile.offset = offset
Thomas Wouters89f507f2006-12-13 04:49:30 +00001320
Guido van Rossume7ba4952007-06-06 23:52:48 +00001321 # Patch the TarInfo object with saved global
Guido van Rossumd8faa362007-04-27 19:54:29 +00001322 # header information.
Guido van Rossume7ba4952007-06-06 23:52:48 +00001323 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001324
1325 return self
1326
1327 def _proc_gnulong(self, tarfile):
1328 """Process the blocks that hold a GNU longname
1329 or longlink member.
1330 """
1331 buf = tarfile.fileobj.read(self._block(self.size))
1332
1333 # Fetch the next header and process it.
Lars Gustäbel9520a432009-11-22 18:48:49 +00001334 try:
1335 next = self.fromtarfile(tarfile)
1336 except HeaderError:
1337 raise SubsequentHeaderError("missing or bad subsequent header")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001338
1339 # Patch the TarInfo object from the next header with
1340 # the longname information.
1341 next.offset = self.offset
1342 if self.type == GNUTYPE_LONGNAME:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001343 next.name = nts(buf, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001344 elif self.type == GNUTYPE_LONGLINK:
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001345 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001346
1347 return next
1348
1349 def _proc_sparse(self, tarfile):
1350 """Process a GNU sparse header plus extra headers.
1351 """
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +00001352 # We already collected some sparse structures in frombuf().
1353 structs, isextended, origsize = self._sparse_structs
1354 del self._sparse_structs
Guido van Rossumd8faa362007-04-27 19:54:29 +00001355
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +00001356 # Collect sparse structures from extended header blocks.
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001357 while isextended:
Guido van Rossumd8faa362007-04-27 19:54:29 +00001358 buf = tarfile.fileobj.read(BLOCKSIZE)
1359 pos = 0
Guido van Rossum805365e2007-05-07 22:24:25 +00001360 for i in range(21):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001361 try:
1362 offset = nti(buf[pos:pos + 12])
1363 numbytes = nti(buf[pos + 12:pos + 24])
1364 except ValueError:
1365 break
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001366 if offset and numbytes:
1367 structs.append((offset, numbytes))
Guido van Rossumd8faa362007-04-27 19:54:29 +00001368 pos += 24
Lars Gustäbelb506dc32007-08-07 18:36:16 +00001369 isextended = bool(buf[504])
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001370 self.sparse = structs
Guido van Rossumd8faa362007-04-27 19:54:29 +00001371
1372 self.offset_data = tarfile.fileobj.tell()
1373 tarfile.offset = self.offset_data + self._block(self.size)
1374 self.size = origsize
Guido van Rossumd8faa362007-04-27 19:54:29 +00001375 return self
1376
1377 def _proc_pax(self, tarfile):
1378 """Process an extended or global header as described in
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001379 POSIX.1-2008.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001380 """
1381 # Read the header information.
1382 buf = tarfile.fileobj.read(self._block(self.size))
1383
1384 # A pax header stores supplemental information for either
1385 # the following file (extended) or all following files
1386 # (global).
1387 if self.type == XGLTYPE:
1388 pax_headers = tarfile.pax_headers
1389 else:
1390 pax_headers = tarfile.pax_headers.copy()
1391
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001392 # Check if the pax header contains a hdrcharset field. This tells us
1393 # the encoding of the path, linkpath, uname and gname fields. Normally,
1394 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1395 # implementations are allowed to store them as raw binary strings if
1396 # the translation to UTF-8 fails.
1397 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1398 if match is not None:
1399 pax_headers["hdrcharset"] = match.group(1).decode("utf8")
1400
1401 # For the time being, we don't care about anything other than "BINARY".
1402 # The only other value that is currently allowed by the standard is
1403 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1404 hdrcharset = pax_headers.get("hdrcharset")
1405 if hdrcharset == "BINARY":
1406 encoding = tarfile.encoding
1407 else:
1408 encoding = "utf8"
1409
Guido van Rossumd8faa362007-04-27 19:54:29 +00001410 # Parse pax header information. A record looks like that:
1411 # "%d %s=%s\n" % (length, keyword, value). length is the size
1412 # of the complete record including the length field itself and
Guido van Rossume7ba4952007-06-06 23:52:48 +00001413 # the newline. keyword and value are both UTF-8 encoded strings.
Antoine Pitroufd036452008-08-19 17:56:33 +00001414 regex = re.compile(br"(\d+) ([^=]+)=")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001415 pos = 0
1416 while True:
1417 match = regex.match(buf, pos)
1418 if not match:
1419 break
1420
1421 length, keyword = match.groups()
1422 length = int(length)
1423 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1424
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001425 # Normally, we could just use "utf8" as the encoding and "strict"
1426 # as the error handler, but we better not take the risk. For
1427 # example, GNU tar <= 1.23 is known to store filenames it cannot
1428 # translate to UTF-8 as raw strings (unfortunately without a
1429 # hdrcharset=BINARY header).
1430 # We first try the strict standard encoding, and if that fails we
1431 # fall back on the user's encoding and error handler.
1432 keyword = self._decode_pax_field(keyword, "utf8", "utf8",
1433 tarfile.errors)
1434 if keyword in PAX_NAME_FIELDS:
1435 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1436 tarfile.errors)
1437 else:
1438 value = self._decode_pax_field(value, "utf8", "utf8",
1439 tarfile.errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001440
1441 pax_headers[keyword] = value
1442 pos += length
1443
Guido van Rossume7ba4952007-06-06 23:52:48 +00001444 # Fetch the next header.
Lars Gustäbel9520a432009-11-22 18:48:49 +00001445 try:
1446 next = self.fromtarfile(tarfile)
1447 except HeaderError:
1448 raise SubsequentHeaderError("missing or bad subsequent header")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001449
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001450 # Process GNU sparse information.
1451 if "GNU.sparse.map" in pax_headers:
1452 # GNU extended sparse format version 0.1.
1453 self._proc_gnusparse_01(next, pax_headers)
1454
1455 elif "GNU.sparse.size" in pax_headers:
1456 # GNU extended sparse format version 0.0.
1457 self._proc_gnusparse_00(next, pax_headers, buf)
1458
1459 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1460 # GNU extended sparse format version 1.0.
1461 self._proc_gnusparse_10(next, pax_headers, tarfile)
1462
Guido van Rossume7ba4952007-06-06 23:52:48 +00001463 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
Guido van Rossume7ba4952007-06-06 23:52:48 +00001464 # Patch the TarInfo object with the extended header info.
1465 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1466 next.offset = self.offset
1467
1468 if "size" in pax_headers:
1469 # If the extended header replaces the size field,
1470 # we need to recalculate the offset where the next
1471 # header starts.
1472 offset = next.offset_data
1473 if next.isreg() or next.type not in SUPPORTED_TYPES:
1474 offset += next._block(next.size)
1475 tarfile.offset = offset
1476
1477 return next
1478
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001479 def _proc_gnusparse_00(self, next, pax_headers, buf):
1480 """Process a GNU tar extended sparse header, version 0.0.
1481 """
1482 offsets = []
1483 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1484 offsets.append(int(match.group(1)))
1485 numbytes = []
1486 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1487 numbytes.append(int(match.group(1)))
1488 next.sparse = list(zip(offsets, numbytes))
1489
1490 def _proc_gnusparse_01(self, next, pax_headers):
1491 """Process a GNU tar extended sparse header, version 0.1.
1492 """
1493 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1494 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1495
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

    The sparse map is read from `tarfile.fileobj' as newline-terminated
    decimal numbers: first the number of map entries, then two numbers
    per entry. More data is read in BLOCKSIZE chunks whenever the buffer
    holds no complete line. Afterwards `next.offset_data' is set to the
    current file position and `next.sparse' to the list of value pairs.
    `pax_headers' is accepted but not used.
    """
    stream = tarfile.fileobj
    data = stream.read(BLOCKSIZE)
    count, data = data.split(b"\n", 1)
    wanted = int(count) * 2

    values = []
    while len(values) < wanted:
        if b"\n" not in data:
            data += stream.read(BLOCKSIZE)
        token, data = data.split(b"\n", 1)
        values.append(int(token))

    next.offset_data = stream.tell()
    next.sparse = list(zip(values[::2], values[1::2]))
1511
Guido van Rossume7ba4952007-06-06 23:52:48 +00001512 def _apply_pax_info(self, pax_headers, encoding, errors):
1513 """Replace fields with supplemental information from a previous
1514 pax extended or global header.
1515 """
1516 for keyword, value in pax_headers.items():
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001517 if keyword == "GNU.sparse.name":
1518 setattr(self, "path", value)
1519 elif keyword == "GNU.sparse.size":
1520 setattr(self, "size", int(value))
1521 elif keyword == "GNU.sparse.realsize":
1522 setattr(self, "size", int(value))
1523 elif keyword in PAX_FIELDS:
1524 if keyword in PAX_NUMBER_FIELDS:
1525 try:
1526 value = PAX_NUMBER_FIELDS[keyword](value)
1527 except ValueError:
1528 value = 0
1529 if keyword == "path":
1530 value = value.rstrip("/")
1531 setattr(self, keyword, value)
Guido van Rossume7ba4952007-06-06 23:52:48 +00001532
1533 self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001534
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001535 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1536 """Decode a single field from a pax record.
1537 """
1538 try:
1539 return value.decode(encoding, "strict")
1540 except UnicodeDecodeError:
1541 return value.decode(fallback_encoding, fallback_errors)
1542
def _block(self, count):
    """Round up a byte count by BLOCKSIZE and return it,
       e.g. _block(834) => 1024.
    """
    full, rest = divmod(count, BLOCKSIZE)
    # A partially filled block still occupies a whole block.
    return (full + 1 if rest else full) * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001551
# Type predicates: each one inspects self.type (or self.sparse) only.
def isreg(self):
    """Return True if the member's type is one of REGULAR_TYPES."""
    return self.type in REGULAR_TYPES
def isfile(self):
    """Return the result of isreg()."""
    return self.isreg()
def isdir(self):
    """Return True if the member's type is DIRTYPE."""
    return self.type == DIRTYPE
def issym(self):
    """Return True if the member's type is SYMTYPE."""
    return self.type == SYMTYPE
def islnk(self):
    """Return True if the member's type is LNKTYPE."""
    return self.type == LNKTYPE
def ischr(self):
    """Return True if the member's type is CHRTYPE."""
    return self.type == CHRTYPE
def isblk(self):
    """Return True if the member's type is BLKTYPE."""
    return self.type == BLKTYPE
def isfifo(self):
    """Return True if the member's type is FIFOTYPE."""
    return self.type == FIFOTYPE
def issparse(self):
    """Return True if a sparse map has been set (self.sparse is not None)."""
    return self.sparse is not None
def isdev(self):
    """Return True if the member's type is CHRTYPE, BLKTYPE or FIFOTYPE."""
    return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1572# class TarInfo
1573
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    # The attributes below are class-level defaults. __init__() replaces
    # one on the instance only when the matching argument is passed with
    # a value other than None; `errors' is always set per instance.

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.
                                # extract() re-raises EnvironmentError when
                                # errorlevel > 0; extractall() re-raises
                                # ExtractError only when errorlevel > 1.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The default ExFileObject class to use.
1599
def __init__(self, name=None, mode="r", fileobj=None, format=None,
        tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
        errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
    """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
       read from an existing archive, 'a' to append data to an existing
       file or 'w' to create a new file overwriting an existing one. `mode'
       defaults to 'r'.
       If `fileobj' is given, it is used for reading or writing data. If it
       can be determined, `mode' is overridden by `fileobj's mode.
       `fileobj' is not closed, when TarFile is closed.

       `format', `tarinfo', `dereference', `ignore_zeros', `encoding',
       `debug' and `errorlevel' override the class-level defaults when
       they are not None. `errors' is always stored. `pax_headers' is
       kept only if the resulting format is PAX_FORMAT.

       Raises ValueError for any `mode' other than 'r', 'a' or 'w', and
       ReadError if an archive opened for appending has a bad header.
    """
    # Validate against the mapping itself. A substring test such as
    # `mode not in "raw"' lets the empty string through, which then
    # fails with a KeyError on the lookup below.
    modes = {"r": "rb", "a": "r+b", "w": "wb"}
    if mode not in modes:
        raise ValueError("mode must be 'r', 'a' or 'w'")
    self.mode = mode
    self._mode = modes[mode]

    if not fileobj:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        fileobj = bltn_open(name, self._mode)
        self._extfileobj = False
    else:
        if name is None and hasattr(fileobj, "name"):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # Init attributes.
    if format is not None:
        self.format = format
    if tarinfo is not None:
        self.tarinfo = tarinfo
    if dereference is not None:
        self.dereference = dereference
    if ignore_zeros is not None:
        self.ignore_zeros = ignore_zeros
    if encoding is not None:
        self.encoding = encoding
    self.errors = errors

    if pax_headers is not None and self.format == PAX_FORMAT:
        self.pax_headers = pax_headers
    else:
        self.pax_headers = {}

    if debug is not None:
        self.debug = debug
    if errorlevel is not None:
        self.errorlevel = errorlevel

    # Init datastructures.
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    tarinfo = self.tarinfo.fromtarfile(self)
                    self.members.append(tarinfo)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e))

        if self.mode in ("a", "w"):
            self._loaded = True

            if self.pax_headers:
                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    except:
        # Deliberately bare: whatever went wrong, release a file that
        # was opened here and mark the object closed, then re-raise.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001695
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001696 #--------------------------------------------------------------------------
1697 # Below are the classmethods which act as alternate constructors to the
1698 # TarFile class. The open() method is the only one that is needed for
1699 # public use; it is the "super"-constructor and is able to select an
1700 # adequate "sub"-constructor for a particular compression using the mapping
1701 # from OPEN_METH.
1702 #
1703 # This concept allows one to subclass TarFile without losing the comfort of
1704 # the super-constructor. A sub-constructor is registered and made available
1705 # by adding it to the mapping in OPEN_METH.
1706
@classmethod
def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
    """Open a tar archive for reading, writing or appending. Return
       an appropriate TarFile class.

       mode:
       'r' or 'r:*' open for reading with transparent compression
       'r:'         open for reading exclusively uncompressed
       'r:gz'       open for reading with gzip compression
       'r:bz2'      open for reading with bzip2 compression
       'a' or 'a:'  open for appending, creating the file if necessary
       'w' or 'w:'  open for writing without compression
       'w:gz'       open for writing with gzip compression
       'w:bz2'      open for writing with bzip2 compression

       'r|*'        open a stream of tar blocks with transparent compression
       'r|'         open an uncompressed stream of tar blocks for reading
       'r|gz'       open a gzip compressed stream of tar blocks
       'r|bz2'      open a bzip2 compressed stream of tar blocks
       'w|'         open an uncompressed stream for writing
       'w|gz'       open a gzip compressed stream for writing
       'w|bz2'      open a bzip2 compressed stream for writing

       `bufsize' is passed to the stream object in the '|' modes. All
       other keyword arguments are forwarded to the selected constructor.

       Raises ValueError if neither `name' nor `fileobj' is given or the
       mode cannot be interpreted, CompressionError for an unknown
       compression type, and ReadError if no registered opener accepts
       the file in transparent read mode.
    """

    if not name and not fileobj:
        raise ValueError("nothing to open")

    if mode in ("r", "r:*"):
        # Find out which *open() is appropriate for opening the file.
        for comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
            if fileobj is not None:
                saved_pos = fileobj.tell()
            try:
                return func(name, "r", fileobj, **kwargs)
            except (ReadError, CompressionError):
                # Rewind a caller-supplied file before the next attempt.
                if fileobj is not None:
                    fileobj.seek(saved_pos)
                continue
        raise ReadError("file could not be opened successfully")

    elif ":" in mode:
        filemode, comptype = mode.split(":", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Select the *open() function according to
        # given compression.
        if comptype in cls.OPEN_METH:
            func = getattr(cls, cls.OPEN_METH[comptype])
        else:
            raise CompressionError("unknown compression type %r" % comptype)
        return func(name, filemode, fileobj, **kwargs)

    elif "|" in mode:
        filemode, comptype = mode.split("|", 1)
        filemode = filemode or "r"
        comptype = comptype or "tar"

        # Tuple membership, not a substring test: "rw" must be rejected
        # here instead of failing later inside the stream object.
        if filemode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        stream = _Stream(name, filemode, comptype, fileobj, bufsize)
        try:
            t = cls(name, filemode, stream, **kwargs)
        except:
            # Bare on purpose: close the stream and re-raise.
            stream.close()
            raise
        t._extfileobj = False
        return t

    # Tuple membership again: "" and "aw" are substrings of "aw" but
    # are not valid modes.
    elif mode in ("a", "w"):
        return cls.taropen(name, mode, fileobj, **kwargs)

    raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001782
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.

    Returns cls(name, mode, fileobj, **kwargs). Raises ValueError if
    `mode' is not exactly 'r', 'a' or 'w'.
    """
    # Tuple membership rejects the empty string, which a substring test
    # against "raw" would let through.
    if mode not in ("r", "a", "w"):
        raise ValueError("mode must be 'r', 'a' or 'w'")
    return cls(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001790
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
       Appending is not allowed.

       `fileobj', if given, is wrapped in a gzip.GzipFile; otherwise the
       GzipFile is created from `name'. The returned TarFile has
       _extfileobj set to whether the caller supplied `fileobj'.

       Raises ValueError if `mode' is not exactly 'r' or 'w',
       CompressionError if the gzip module is unusable, and ReadError if
       an IOError occurs after the gzip file object was created.
    """
    # Tuple membership rejects the empty string, which a substring test
    # against "rw" would let through.
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    extfileobj = fileobj is not None
    try:
        fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except IOError:
        if not extfileobj and fileobj is not None:
            fileobj.close()
        # fileobj is still None if the GzipFile could not be created
        # from `name'; pass that error on unchanged.
        if fileobj is None:
            raise
        raise ReadError("not a gzip file")
    except:
        # Bare on purpose: clean up and re-raise whatever happened.
        if not extfileobj and fileobj is not None:
            fileobj.close()
        raise
    t._extfileobj = extfileobj
    return t
1821
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
       Appending is not allowed.

       `fileobj', if given, is wrapped in a _BZ2Proxy; otherwise a
       bz2.BZ2File is opened from `name'. The returned TarFile has
       _extfileobj set to False.

       Raises ValueError if `mode' is not exactly 'r' or 'w',
       CompressionError if the bz2 module cannot be imported, and
       ReadError if IOError or EOFError occurs while opening the archive.
    """
    # Tuple membership rejects the empty string, which a substring test
    # against "rw" would let through.
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    if fileobj is not None:
        fileobj = _BZ2Proxy(fileobj, mode)
    else:
        fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (IOError, EOFError):
        fileobj.close()
        raise ReadError("not a bzip2 file")
    except:
        # Same cleanup as gzopen(): do not leak the compressed file
        # object when opening fails for any other reason.
        fileobj.close()
        raise
    t._extfileobj = False
    return t
1847
# All *open() methods are registered here.
# Keys are the compression names used in mode strings (the "gz" in
# "r:gz"); values are the names of the classmethods that open such an
# archive. open() resolves a value with getattr(cls, ...).
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz":  "gzopen",    # gzip compressed tar
    "bz2": "bz2open"    # bzip2 compressed tar
}
1854
1855 #--------------------------------------------------------------------------
1856 # The public methods which TarFile provides:
1857
def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive.

       Calling close() on an already closed TarFile does nothing. The
       underlying file object is closed unless it was supplied by the
       caller (self._extfileobj), even if writing the trailer fails.
    """
    if self.closed:
        return

    # Mark as closed first so that a failure below cannot lead to a
    # second attempt at writing the trailer.
    self.closed = True
    try:
        if self.mode in ("a", "w"):
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            remainder = self.offset % RECORDSIZE
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
    finally:
        # Release the file even when writing the trailer raised.
        if not self._extfileobj:
            self.fileobj.close()
1877
def getmember(self, name):
    """Return a TarInfo object for member `name'. If `name' can not be
       found in the archive, KeyError is raised. If a member occurs more
       than once in the archive, its last occurrence is assumed to be the
       most up-to-date version.
    """
    found = self._getmember(name)
    if found is not None:
        return found
    raise KeyError("filename %r not found" % name)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001888
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects. The
       list has the same order as the members in the archive.
    """
    self._check()
    if self._loaded:
        return self.members
    # The whole archive has to be scanned once before the member list
    # is complete.
    self._load()
    return self.members
1898
def getnames(self):
    """Return the members of the archive as a list of their names. It has
       the same order as the list returned by getmembers().
    """
    names = []
    for member in self.getmembers():
        names.append(member.name)
    return names
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001904
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object for either the file `name' or the file
       object `fileobj' (using os.fstat on its file descriptor). You can
       modify some of the TarInfo's attributes before you add it using
       addfile(). If given, `arcname' specifies an alternative name for the
       file in the archive.

       Returns None if the file is not a regular file, directory, fifo,
       symbolic link, character device or block device.
    """
    self._check("aw")

    # When fileobj is given, replace name by
    # fileobj's real name.
    if fileobj is not None:
        name = fileobj.name

    # Building the name of the member in the archive.
    # Backward slashes are converted to forward slashes,
    # Absolute paths are turned to relative paths.
    if arcname is None:
        arcname = name
    arcname = os.path.splitdrive(arcname)[1]
    arcname = arcname.replace(os.sep, "/").lstrip("/")

    # Now, fill the TarInfo object with
    # information specific for the file.
    tarinfo = self.tarinfo()
    tarinfo.tarfile = self

    # Use os.stat or os.lstat, depending on platform
    # and if symlinks shall be resolved.
    if fileobj is not None:
        statres = os.fstat(fileobj.fileno())
    elif hasattr(os, "lstat") and not self.dereference:
        statres = os.lstat(name)
    else:
        statres = os.stat(name)

    stmd = statres.st_mode
    linkname = ""
    if stat.S_ISREG(stmd):
        inode = (statres.st_ino, statres.st_dev)
        # Is it a hardlink to an already archived file?
        already_archived = (not self.dereference and statres.st_nlink > 1
                            and inode in self.inodes
                            and arcname != self.inodes[inode])
        if already_archived:
            kind = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # The inode is added only if its valid.
            # For win32 it is always 0.
            kind = REGTYPE
            if inode[0]:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(stmd):
        kind = DIRTYPE
    elif stat.S_ISFIFO(stmd):
        kind = FIFOTYPE
    elif stat.S_ISLNK(stmd):
        kind = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(stmd):
        kind = CHRTYPE
    elif stat.S_ISBLK(stmd):
        kind = BLKTYPE
    else:
        return None

    # Fill the TarInfo object with all
    # information we can get.
    tarinfo.name = arcname
    tarinfo.mode = stmd
    tarinfo.uid = statres.st_uid
    tarinfo.gid = statres.st_gid
    # Only regular files carry data in the archive.
    tarinfo.size = statres.st_size if kind == REGTYPE else 0
    tarinfo.mtime = statres.st_mtime
    tarinfo.type = kind
    tarinfo.linkname = linkname
    if pwd:
        try:
            tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
        except KeyError:
            pass
    if grp:
        try:
            tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
        except KeyError:
            pass

    if kind in (CHRTYPE, BLKTYPE):
        if hasattr(os, "major") and hasattr(os, "minor"):
            tarinfo.devmajor = os.major(statres.st_rdev)
            tarinfo.devminor = os.minor(statres.st_rdev)
    return tarinfo
2002
def list(self, verbose=True):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced.
    """
    self._check()

    def emit(*fields):
        # Print the fields followed by one space and no newline.
        print(*fields, end=' ')

    for member in self:
        if verbose:
            emit(filemode(member.mode))
            emit("%s/%s" % (member.uname or member.uid,
                            member.gname or member.gid))
            if member.ischr() or member.isblk():
                emit("%10s" % ("%d,%d" \
                               % (member.devmajor, member.devminor)))
            else:
                emit("%10d" % member.size)
            emit("%d-%02d-%02d %02d:%02d:%02d" \
                 % time.localtime(member.mtime)[:6])

        emit(member.name + ("/" if member.isdir() else ""))

        if verbose:
            if member.issym():
                emit("->", member.linkname)
            if member.islnk():
                emit("link to", member.linkname)
        print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002031
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. `exclude' is a function that should
       return True for each filename to be excluded. `filter' is a function
       that expects a TarInfo object argument and returns the changed
       TarInfo object, if it returns None the TarInfo object will be
       excluded from the archive.

       Passing `exclude' emits a DeprecationWarning. A `name' that
       resolves to the archive's own path is skipped.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)
    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        with bltn_open(name, "rb") as source:
            self.addfile(tarinfo, source)
        return

    # Everything else is stored as a header only; directories are
    # descended into afterwards when requested.
    descend = tarinfo.isdir() and recursive
    self.addfile(tarinfo)
    if descend:
        for entry in os.listdir(name):
            self.add(os.path.join(name, entry), os.path.join(arcname, entry),
                    recursive, exclude, filter=filter)
2092
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, tarinfo.size bytes are read from it and added to the archive.
       You can create TarInfo objects using gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.

       A shallow copy of `tarinfo' is what gets appended to self.members.
    """
    self._check("aw")

    member = copy.copy(tarinfo)

    header = member.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(header)
    self.offset += len(header)

    # If there's data to follow, append it.
    if fileobj is not None:
        copyfileobj(fileobj, self.fileobj, member.size)
        full, rest = divmod(member.size, BLOCKSIZE)
        if rest > 0:
            # Pad the last block with NULs.
            self.fileobj.write(NUL * (BLOCKSIZE - rest))
            full += 1
        self.offset += full * BLOCKSIZE

    self.members.append(member)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002118
def extractall(self, path=".", members=None):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers().

       Member names are joined to `path' as they are; nothing in this
       method checks where the resulting path points.
    """
    if members is None:
        members = self

    deferred = []
    for member in members:
        target = member
        if member.isdir():
            # Extract directories with a safe mode.
            deferred.append(member)
            target = copy.copy(member)
            target.mode = 0o700
        # Do not set_attrs directories, as we will do that further down
        self.extract(target, path, set_attrs=not target.isdir())

    # Walk the directories in reverse name order, so that entries
    # inside a directory are handled before the directory itself.
    for member in reversed(sorted(deferred, key=lambda d: d.name)):
        dirpath = os.path.join(path, member.name)
        # Set correct owner, mtime and filemode on directories.
        try:
            self.chown(member, dirpath)
            self.utime(member, dirpath)
            self.chmod(member, dirpath)
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)
2156
def extract(self, member, path="", set_attrs=True):
    """Extract one member of the archive below `path', using its full
       name.

       `member' is either a name, which is resolved with getmember(), or
       a TarInfo object. `path' defaults to the current working
       directory. Owner, modification time and mode are applied unless
       `set_attrs' is False.

       An EnvironmentError from the extraction is re-raised when
       errorlevel > 0 and an ExtractError when errorlevel > 1; otherwise
       the error is only reported through _dbg().
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # makelink() needs to know where the hard link's target lives on disk.
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    try:
        self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                             set_attrs=set_attrs)
    except EnvironmentError as exc:
        if self.errorlevel > 0:
            raise
        if exc.filename is None:
            report = "tarfile: %s" % exc.strerror
        else:
            report = "tarfile: %s %r" % (exc.strerror, exc.filename)
        self._dbg(1, report)
    except ExtractError as exc:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % exc)
2191
def extractfile(self, member):
    """Return a read-only file object for a member of the archive.

       `member' is either a name, which is resolved with getmember(), or
       a TarInfo object. Regular files, and members whose type is not in
       SUPPORTED_TYPES, yield a file-like object. For a hard or symbolic
       link the file object of the link's target is returned; this is
       not possible on a _Stream and raises StreamError there. Any other
       member (directory, device, fifo, ...) carries no data and yields
       None.
       The file-like object provides the following methods: read(),
       readline(), readlines(), seek() and tell()
    """
    self._check("r")

    if isinstance(member, str):
        tarinfo = self.getmember(member)
    else:
        tarinfo = member

    # Regular files, and unknown types which are treated like them.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    # No data is associated with the member (directory, chrdev,
    # blkdev, etc.).
    if not (tarinfo.islnk() or tarinfo.issym()):
        return None

    # A (sym)link cannot be followed on a non-seekable stream of tar
    # blocks.
    if isinstance(self.fileobj, _Stream):
        raise StreamError("cannot extract (sym)link as file object")

    # A (sym)link's file object is its target's file object.
    return self.extractfile(self._find_link_target(tarinfo))
2229
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002230 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002231 """Extract the TarInfo object tarinfo to a physical
2232 file called targetpath.
2233 """
2234 # Fetch the TarInfo object for the given name
2235 # and build the destination pathname, replacing
2236 # forward slashes to platform specific separators.
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +00002237 targetpath = targetpath.rstrip("/")
2238 targetpath = targetpath.replace("/", os.sep)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002239
2240 # Create all upper directories.
2241 upperdirs = os.path.dirname(targetpath)
2242 if upperdirs and not os.path.exists(upperdirs):
Christian Heimes2202f872008-02-06 14:31:34 +00002243 # Create directories that are not part of the archive with
2244 # default permissions.
Thomas Woutersb2137042007-02-01 18:02:27 +00002245 os.makedirs(upperdirs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002246
2247 if tarinfo.islnk() or tarinfo.issym():
2248 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2249 else:
2250 self._dbg(1, tarinfo.name)
2251
2252 if tarinfo.isreg():
2253 self.makefile(tarinfo, targetpath)
2254 elif tarinfo.isdir():
2255 self.makedir(tarinfo, targetpath)
2256 elif tarinfo.isfifo():
2257 self.makefifo(tarinfo, targetpath)
2258 elif tarinfo.ischr() or tarinfo.isblk():
2259 self.makedev(tarinfo, targetpath)
2260 elif tarinfo.islnk() or tarinfo.issym():
2261 self.makelink(tarinfo, targetpath)
2262 elif tarinfo.type not in SUPPORTED_TYPES:
2263 self.makeunknown(tarinfo, targetpath)
2264 else:
2265 self.makefile(tarinfo, targetpath)
2266
Martin v. Löwis16f344d2010-11-01 21:39:13 +00002267 if set_attrs:
2268 self.chown(tarinfo, targetpath)
2269 if not tarinfo.issym():
2270 self.chmod(tarinfo, targetpath)
2271 self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002272
2273 #--------------------------------------------------------------------------
2274 # Below are the different file methods. They are called via
2275 # _extract_member() when extract() is called. They can be replaced in a
2276 # subclass to implement other functionality.
2277
def makedir(self, tarinfo, targetpath):
    """Create the directory `targetpath'.

       The directory is created with the safe mode 0o700. An already
       existing `targetpath' (EEXIST) is not an error; any other
       EnvironmentError propagates.
    """
    try:
        os.mkdir(targetpath, 0o700)
    except EnvironmentError as exc:
        if exc.errno == errno.EEXIST:
            return
        raise
2288
def makefile(self, tarinfo, targetpath):
    """Write the data of the member `tarinfo' to the file `targetpath'.

       The archive's file object is positioned at tarinfo.offset_data
       and copied into a freshly opened binary file. For a sparse member
       every (offset, size) pair of tarinfo.sparse is written at its
       offset. Finally the file is truncated at tarinfo.size.
    """
    archive = self.fileobj
    archive.seek(tarinfo.offset_data)
    with bltn_open(targetpath, "wb") as out:
        if tarinfo.sparse is None:
            copyfileobj(archive, out, tarinfo.size)
        else:
            for position, length in tarinfo.sparse:
                out.seek(position)
                copyfileobj(archive, out, length)
        # Fix the final length, which also covers data that ends in a hole.
        out.seek(tarinfo.size)
        out.truncate()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002303
def makeunknown(self, tarinfo, targetpath):
    """Extract a member of unknown type to `targetpath'.

       The member is written with makefile() and a level 1 debug
       message naming tarinfo.type is emitted afterwards.
    """
    self.makefile(tarinfo, targetpath)
    notice = ("tarfile: Unknown file type %r, "
              "extracted as regular file." % tarinfo.type)
    self._dbg(1, notice)
2311
def makefifo(self, tarinfo, targetpath):
    """Create the fifo `targetpath'.

       Raises ExtractError when the os module provides no mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002319
def makedev(self, tarinfo, targetpath):
    """Create the character or block device `targetpath'.

       The node is created with os.mknod() from tarinfo.mode combined
       with S_IFBLK for block devices and S_IFCHR otherwise, using the
       device number built from tarinfo.devmajor and tarinfo.devminor.
       Raises ExtractError when the os module lacks mknod() or
       makedev().
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    mode = tarinfo.mode | (stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR)
    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, mode, device)
2334
def makelink(self, tarinfo, targetpath):
    """Create the symbolic or hard link `targetpath'.

       A symbolic link is created pointing at tarinfo.linkname. A hard
       link is created to tarinfo._link_target (prepared by extract())
       when that path exists; otherwise the link's target member is
       extracted to `targetpath' instead. If link creation fails with
       symlink_exception (platform limitation), a copy of the target
       member is extracted; ExtractError is raised when that member
       cannot be found in the archive.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        elif os.path.exists(tarinfo._link_target):
            # See extract().
            os.link(tarinfo._link_target, targetpath)
        else:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
    except symlink_exception:
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002357
def chown(self, tarinfo, targetpath):
    """Set the owner of `targetpath' according to `tarinfo'.

       Nothing happens unless the pwd module is available and the
       effective user id is 0. Group and user are looked up by name
       (tarinfo.gname, tarinfo.uname) and fall back to the numeric
       tarinfo.gid and tarinfo.uid when the name is unknown. Symbolic
       links are changed with os.lchown() where it exists. Raises
       ExtractError when the ownership change fails.
    """
    # We have to be root to do so.
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        return

    try:
        group_id = grp.getgrnam(tarinfo.gname).gr_gid
    except KeyError:
        group_id = tarinfo.gid

    try:
        user_id = pwd.getpwnam(tarinfo.uname).pw_uid
    except KeyError:
        user_id = tarinfo.uid

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, user_id, group_id)
        elif sys.platform != "os2emx":
            os.chown(targetpath, user_id, group_id)
    except EnvironmentError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002379
def chmod(self, tarinfo, targetpath):
    """Set the permissions of `targetpath' to tarinfo.mode.

       Does nothing when the os module has no chmod(). Raises
       ExtractError when the mode cannot be changed.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except EnvironmentError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002388
def utime(self, tarinfo, targetpath):
    """Set access and modification time of `targetpath' to
       tarinfo.mtime.

       Does nothing when the os module has no utime(). Raises
       ExtractError when the times cannot be changed.
    """
    if hasattr(os, 'utime'):
        stamp = (tarinfo.mtime, tarinfo.mtime)
        try:
            os.utime(targetpath, stamp)
        except EnvironmentError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002398
2399 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, or
       None when no further member is available.

       A pending self.firstmember is handed out first, exactly once.
       Otherwise the header at self.offset is read with
       self.tarinfo.fromtarfile(); a member that was read is appended to
       self.members, and when none was read the archive is marked as
       _loaded.

       With ignore_zeros set, blocks that raise EOFHeaderError or
       InvalidHeaderError are skipped one BLOCKSIZE at a time. Invalid,
       empty or truncated headers at offset 0 raise ReadError, as does
       any SubsequentHeaderError; elsewhere they end the archive.
    """
    self._check("ra")

    pending = self.firstmember
    if pending is not None:
        self.firstmember = None
        return pending

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as exc:
            if not self.ignore_zeros:
                break
            # Skip the block and try the following one.
            self._dbg(2, "0x%X: %s" % (self.offset, exc))
            self.offset += BLOCKSIZE
        except InvalidHeaderError as exc:
            if self.ignore_zeros:
                # Skip the block and try the following one.
                self._dbg(2, "0x%X: %s" % (self.offset, exc))
                self.offset += BLOCKSIZE
            elif self.offset == 0:
                raise ReadError(str(exc))
            else:
                break
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
            break
        except TruncatedHeaderError as exc:
            if self.offset == 0:
                raise ReadError(str(exc))
            break
        except SubsequentHeaderError as exc:
            raise ReadError(str(exc))
        else:
            break

    if tarinfo is None:
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002445
2446 #--------------------------------------------------------------------------
2447 # Little helper methods:
2448
Lars Gustäbel1b512722010-06-03 12:45:16 +00002449 def _getmember(self, name, tarinfo=None, normalize=False):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002450 """Find an archive member by name from bottom to top.
2451 If tarinfo is given, it is used as the starting point.
2452 """
Martin v. Löwisf3c56112004-09-18 09:08:52 +00002453 # Ensure that all members have been loaded.
2454 members = self.getmembers()
2455
Lars Gustäbel1b512722010-06-03 12:45:16 +00002456 # Limit the member search list up to tarinfo.
2457 if tarinfo is not None:
2458 members = members[:members.index(tarinfo)]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002459
Lars Gustäbel1b512722010-06-03 12:45:16 +00002460 if normalize:
2461 name = os.path.normpath(name)
2462
2463 for member in reversed(members):
2464 if normalize:
2465 member_name = os.path.normpath(member.name)
2466 else:
2467 member_name = member.name
2468
2469 if name == member_name:
2470 return member
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002471
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002472 def _load(self):
2473 """Read through the entire archive file and look for readable
2474 members.
2475 """
2476 while True:
2477 tarinfo = self.next()
2478 if tarinfo is None:
2479 break
2480 self._loaded = True
2481
2482 def _check(self, mode=None):
2483 """Check if TarFile is still open, and if the operation's mode
2484 corresponds to TarFile's mode.
2485 """
2486 if self.closed:
Thomas Wouters477c8d52006-05-27 19:21:47 +00002487 raise IOError("%s is closed" % self.__class__.__name__)
Guido van Rossumd8faa362007-04-27 19:54:29 +00002488 if mode is not None and self.mode not in mode:
2489 raise IOError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002490
Lars Gustäbel1b512722010-06-03 12:45:16 +00002491 def _find_link_target(self, tarinfo):
2492 """Find the target member of a symlink or hardlink member in the
2493 archive.
2494 """
2495 if tarinfo.issym():
2496 # Always search the entire archive.
Lars Gustäbel1ef9eda2012-04-24 21:04:40 +02002497 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
Lars Gustäbel1b512722010-06-03 12:45:16 +00002498 limit = None
2499 else:
2500 # Search the archive before the link, because a hard link is
2501 # just a reference to an already archived file.
2502 linkname = tarinfo.linkname
2503 limit = tarinfo
2504
2505 member = self._getmember(linkname, tarinfo=limit, normalize=True)
2506 if member is None:
2507 raise KeyError("linkname %r not found" % linkname)
2508 return member
2509
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002510 def __iter__(self):
2511 """Provide an iterator object.
2512 """
2513 if self._loaded:
2514 return iter(self.members)
2515 else:
2516 return TarIter(self)
2517
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002518 def _dbg(self, level, msg):
2519 """Write debugging output to sys.stderr.
2520 """
2521 if level <= self.debug:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00002522 print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002523
2524 def __enter__(self):
2525 self._check()
2526 return self
2527
2528 def __exit__(self, type, value, traceback):
2529 if type is None:
2530 self.close()
2531 else:
2532 # An exception occurred. We must not call close() because
2533 # it would try to write end-of-archive blocks and padding.
2534 if not self._extfileobj:
2535 self.fileobj.close()
2536 self.closed = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002537# class TarFile
2538
class TarIter:
    """Iterator Class.

       for tarinfo in TarFile(...):
           suite...

       Every iterator yields the members of its TarFile from the very
       first one, independently of how far the archive has already been
       read by other callers.
    """

    def __init__(self, tarfile):
        """Construct a TarIter object for the TarFile `tarfile'.
        """
        self.tarfile = tarfile
        # Position of the next member to hand out in tarfile.members.
        self.index = 0

    def __iter__(self):
        """Return iterator object.
        """
        return self

    def __next__(self):
        """Return the next member of the archive.

           Members the TarFile has already read are served from
           tarfile.members by position; only when those are exhausted
           is TarFile.next() asked to read further. When next() reports
           the end of the archive the TarFile is marked as _loaded and
           StopIteration is raised.
        """
        archive = self.tarfile

        if self.index == 0 and archive.firstmember is not None:
            # The first member was read ahead but not handed out yet.
            # Fetch it through next() so that firstmember is consumed
            # and cannot be returned a second time later on.
            tarinfo = archive.next()
        elif self.index < len(archive.members):
            # Already read, e.g. by an earlier next()/getmembers() call
            # or by another iterator: replay it instead of skipping it.
            tarinfo = archive.members[self.index]
        elif not archive._loaded:
            tarinfo = archive.next()
            if not tarinfo:
                archive._loaded = True
                raise StopIteration
        else:
            raise StopIteration

        self.index += 1
        return tarinfo
2574
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002575#--------------------
2576# exported functions
2577#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.

       The archive is opened with this module's open() and closed again
       right away; a TarError during either step yields False.
    """
    try:
        archive = open(name)
        archive.close()
    except TarError:
        return False
    else:
        return True
2588
# Keep a handle on the built-in open() before the module-level name
# `open' is rebound below; makefile() writes extracted files through it.
bltn_open = open
# From here on the module-level open() is TarFile.open.
open = TarFile.open