blob: 39dc1f185a1058c7e6bacb29dfea6c615398c73f [file] [log] [blame]
Benjamin Peterson90f5ba52010-03-11 22:53:45 +00001#!/usr/bin/env python3
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
Christian Heimes9c1257e2007-11-04 11:37:22 +00005# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00006# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
Guido van Rossumd8faa362007-04-27 19:54:29 +000032version = "0.9.0"
Guido van Rossum98297ee2007-11-06 21:34:58 +000033__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
Senthil Kumaran7c9719c2011-07-28 22:32:49 +080034__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
35__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
Guido van Rossum98297ee2007-11-06 21:34:58 +000036__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000037
38#---------
39# Imports
40#---------
41import sys
42import os
43import shutil
44import stat
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000045import time
46import struct
Thomas Wouters89f507f2006-12-13 04:49:30 +000047import copy
Guido van Rossumd8faa362007-04-27 19:54:29 +000048import re
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000049
50try:
51 import grp, pwd
52except ImportError:
53 grp = pwd = None
54
# Exceptions that mean "a symbolic link cannot be created here".
# os.symlink on Windows prior to 6.0 raises NotImplementedError.
try:
    # WindowsError (1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege.  The name is undefined on
    # other platforms, hence the NameError fallback.
    symlink_exception = (AttributeError, NotImplementedError, WindowsError)
except NameError:
    symlink_exception = (AttributeError, NotImplementedError)
63
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000064# from tarfile import *
65__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
66
Georg Brandl1a3284e2007-12-02 09:40:06 +000067from builtins import open as _open # Since 'open' is TarFile.open
Guido van Rossum8f78fe92006-08-24 04:03:53 +000068
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +000069#---------------------------------------------------------
70# tar constants
71#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
# Both magic strings fill the 8-byte magic+version area of a header.
# The GNU variant is "ustar" followed by TWO spaces and a NUL.
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000104
105#---------------------------------------------------------
106# tarfile constants
107#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (
    REGTYPE, AREGTYPE, LNKTYPE,
    SYMTYPE, DIRTYPE, FIFOTYPE,
    CONTTYPE, CHRTYPE, BLKTYPE,
    GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
    GNUTYPE_SPARSE,
)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE, CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = (
    "path", "linkpath", "size", "mtime",
    "uid", "gid", "uname", "gname",
)

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Maps field name -> converter.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int,
}
140
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000141#---------------------------------------------------------
142# Bits used in the mode field, values in octal.
143#---------------------------------------------------------
# File-type bits.
S_IFLNK = 0o120000        # symbolic link
S_IFREG = 0o100000        # regular file
S_IFBLK = 0o060000        # block device
S_IFDIR = 0o040000        # directory
S_IFCHR = 0o020000        # character device
S_IFIFO = 0o010000        # fifo

# Special permission bits.
TSUID = 0o4000            # set UID on execution
TSGID = 0o2000            # set GID on execution
TSVTX = 0o1000            # reserved

# Owner / group / other permission bits.
TUREAD = 0o400            # read by owner
TUWRITE = 0o200           # write by owner
TUEXEC = 0o100            # execute/search by owner
TGREAD = 0o040            # read by group
TGWRITE = 0o020           # write by group
TGEXEC = 0o010            # execute/search by group
TOREAD = 0o004            # read by other
TOWRITE = 0o002           # write by other
TOEXEC = 0o001            # execute/search by other
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000164
165#---------------------------------------------------------
Guido van Rossumd8faa362007-04-27 19:54:29 +0000166# initialization
167#---------------------------------------------------------
# Default text encoding: UTF-8 when os.name is "nt" or "ce", otherwise
# whatever the interpreter reports as the filesystem encoding.
ENCODING = "utf-8" if os.name in ("nt", "ce") else sys.getfilesystemencoding()
Guido van Rossumd8faa362007-04-27 19:54:29 +0000172
173#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000174# Some useful functions
175#---------------------------------------------------------
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000176
def stn(s, length, encoding, errors):
    """Encode string *s* into a field of exactly *length* bytes.

    The encoded value is cut off after *length* bytes; a shorter value is
    padded on the right with NUL bytes.
    """
    encoded = s.encode(encoding, errors)
    # A negative repeat count yields b"", so over-long values get no padding.
    padding = NUL * (length - len(encoded))
    return encoded[:length] + padding
Thomas Wouters477c8d52006-05-27 19:21:47 +0000182
def nts(s, encoding, errors):
    """Decode the part of bytes object *s* that precedes its first NUL byte.

    If *s* contains no NUL byte the whole object is decoded.
    """
    head, _sep, _tail = s.partition(b"\0")
    return head.decode(encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000190
def nti(s):
    """Convert a number field to a python number.

    Two encodings are understood (see itn()): a NUL-terminated string of
    octal digits, or, when the first byte is 0o200 / 0o377, a big-endian
    base-256 number stored in the remaining bytes (0o377 marks a negative
    value).

    Raises InvalidHeaderError if an octal field cannot be parsed.
    """
    marker = s[0]
    if marker not in (0o200, 0o377):
        # Plain octal digits; an empty field counts as zero.
        try:
            return int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")

    # Base-256: fold the bytes after the marker, most significant first.
    value = 0
    for byte in s[1:]:
        value = (value << 8) + byte
    if marker == 0o377:
        value = -(256 ** (len(s) - 1) - value)
    return value
209
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field of *digits* bytes.

    POSIX 1003.1-1988 requires numbers to be encoded as a string of octal
    digits followed by a null-byte, which allows values up to
    (8**(digits-1))-1.  GNU tar allows storing numbers outside that range:
    a leading 0o200 (positive) or 0o377 (negative) byte is followed by
    digits-1 bytes holding a big-endian base-256 representation.

    Raises ValueError if *n* fits neither encoding for the given *format*.
    """
    width = digits - 1
    if 0 <= n < 8 ** width:
        return bytes("%0*o" % (width, n), "ascii") + NUL

    if format != GNU_FORMAT or not (-256 ** width <= n < 256 ** width):
        raise ValueError("overflow in number field")

    if n >= 0:
        field = bytearray([0o200])
    else:
        field = bytearray([0o377])
        # Shift into the non-negative range so the low bytes can be masked.
        n += 256 ** digits

    # Peel off the low-order bytes, then store them most significant first.
    low_first = []
    for _ in range(width):
        low_first.append(n & 0o377)
        n >>= 8
    field.extend(reversed(low_first))
    return field
237
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a member's header.

    All bytes of the 512-byte header are summed except the chksum field
    (bytes 148-155), which counts as if it were filled with spaces; that
    is the constant 256 (8 * 0x20).  According to the GNU tar sources,
    some tars (Sun and NeXT) calculate chksum with signed char, which
    gives a different result when bytes with the high bit set are
    present, so both variants are computed.
    """
    before_chksum = buf[:148]
    after_chksum = buf[156:512]
    unsigned_chksum = (256
                       + sum(struct.unpack("148B", before_chksum))
                       + sum(struct.unpack("356B", after_chksum)))
    signed_chksum = (256
                     + sum(struct.unpack("148b", before_chksum))
                     + sum(struct.unpack("356b", after_chksum)))
    return unsigned_chksum, signed_chksum
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000250
def copyfileobj(src, dst, length=None):
    """Copy *length* bytes from fileobj *src* to fileobj *dst*.

    If *length* is None the entire remaining content is copied via
    shutil.copyfileobj().  Otherwise data is moved in chunks of at most
    16 KiB and IOError is raised as soon as *src* delivers fewer bytes
    than requested.
    """
    if length is None:
        shutil.copyfileobj(src, dst)
        return
    if length == 0:
        return

    BUFSIZE = 16 * 1024
    blocks, remainder = divmod(length, BUFSIZE)
    # Full-sized chunks first, then the remainder (if any).
    chunk_sizes = [BUFSIZE] * blocks
    if remainder != 0:
        chunk_sizes.append(remainder)

    for wanted in chunk_sizes:
        data = src.read(wanted)
        if len(data) < wanted:
            raise IOError("end of file reached")
        dst.write(data)
    return
275
# One inner tuple per output character of filemode().  Each inner tuple
# lists (bit mask, character) candidates in priority order; the first
# candidate whose mask is fully set in the mode wins.
filemode_table = (
    # file type
    ((S_IFLNK, "l"),
     (S_IFREG, "-"),
     (S_IFBLK, "b"),
     (S_IFDIR, "d"),
     (S_IFCHR, "c"),
     (S_IFIFO, "p")),

    # owner
    ((TUREAD, "r"),),
    ((TUWRITE, "w"),),
    ((TUEXEC | TSUID, "s"),
     (TSUID, "S"),
     (TUEXEC, "x")),

    # group
    ((TGREAD, "r"),),
    ((TGWRITE, "w"),),
    ((TGEXEC | TSGID, "s"),
     (TSGID, "S"),
     (TGEXEC, "x")),

    # other
    ((TOREAD, "r"),),
    ((TOWRITE, "w"),),
    ((TOEXEC | TSVTX, "t"),
     (TSVTX, "T"),
     (TOEXEC, "x")),
)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000302
def filemode(mode):
    """Convert a file's mode to a string of the form
       -rwxrwxrwx.
       Used by TarFile.list()
    """
    # For every position take the first candidate whose bits are all set
    # in mode, or "-" when none of them matches.
    return "".join(
        next((char for bit, char in candidates if mode & bit == bit), "-")
        for candidates in filemode_table
    )
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000317
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000351
352#---------------------------
353# internal stream interface
354#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.

       The file is accessed through a raw descriptor (os.open / os.read /
       os.write / os.close).
    """

    def __init__(self, name, mode):
        """Open *name* for reading (mode "r") or writing (mode "w").

        Any other mode raises KeyError.  Writing creates the file if
        needed and truncates it.
        """
        mode = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        # O_BINARY only exists on some platforms; request it when available.
        if hasattr(os, "O_BINARY"):
            mode |= os.O_BINARY
        self.fd = os.open(name, mode, 0o666)

    def close(self):
        """Close the underlying file descriptor."""
        os.close(self.fd)

    def read(self, size):
        """Return at most *size* bytes read from the descriptor."""
        return os.read(self.fd, size)

    def write(self, s):
        """Write all of the bytes object *s* to the descriptor.

        os.write() may accept only part of the data and reports how many
        bytes it took, so keep writing until nothing is left instead of
        silently dropping the unwritten tail.
        """
        remaining = s
        while remaining:
            written = os.write(self.fd, remaining)
            remaining = remaining[written:]
378
class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip, bzip2 or xz (lzma) compression is
       possible.  A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.

        name     -- file name; used to open the file when fileobj is None
                    and for the gzip header when writing
        mode     -- "r" or "w"
        comptype -- "tar" (no compression), "gz", "bz2", "xz", or "*" to
                    detect the compression from the first block
        fileobj  -- stream-like object, or None to open *name*
        bufsize  -- block size used for raw reads and writes

        Raises CompressionError if the compression type is unknown or the
        module implementing it is not available.
        """
        # Only close the underlying file ourselves if we opened it.
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""      # raw (possibly compressed) data buffer
        self.pos = 0        # position in the uncompressed stream
        self.closed = False

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                if mode == "r":
                    # self.exception is what _read() converts to ReadError.
                    self.exception = zlib.error
                    self._init_read_gz()
                else:
                    self._init_write_gz()

            elif comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = IOError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == "xz":
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype != "tar":
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # Cleanup only; the exception is always re-raised.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # "closed" is missing if __init__ failed before setting it.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        # Negative wbits: raw deflate data, the gzip framing is written here.
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)
        timestamp = struct.pack("<L", int(time.time()))
        # magic, method (deflate), flags (file name follows), mtime,
        # extra flags, OS byte.
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write bytes s to the stream.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Append s to the internal buffer and write out every whole
           block of bufsize bytes that has accumulated.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        # Mark as closed first and release the file in a finally clause so
        # that a failing flush neither leaks an internally opened file nor
        # makes __del__ retry the close.
        self.closed = True
        try:
            if self.mode == "w" and self.comptype != "tar":
                self.buf += self.cmp.flush()

            if self.mode == "w" and self.buf:
                self.fileobj.write(self.buf)
                self.buf = b""
                if self.comptype == "gz":
                    # gzip trailer: CRC32 and uncompressed size, both
                    # masked to 32 bits so struct.pack("<L") always gets
                    # an in-range unsigned value.
                    self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
        finally:
            if not self._extfileobj:
                self.fileobj.close()

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Parses and skips the gzip header so that the following raw reads
        deliver deflate data.  Raises ReadError if the magic number is
        wrong and CompressionError for a method other than deflate.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)      # mtime, extra flags, OS byte

        if flag & 4:
            # Extra field: skip it with the RAW reader.  The public read()
            # would feed header bytes to the decompressor and advance pos.
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.__read(xlen)
        if flag & 8:
            # NUL-terminated original file name
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            # NUL-terminated comment
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            # header CRC16
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.

           Seeking forward is done by reading and discarding data.
           Raises StreamError when pos lies before the current position.
        """
        if pos - self.pos >= 0:
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            # The chunks are bytes, so they must be joined with a bytes
            # separator (a str separator raises TypeError).
            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream (fewer at EOF),
           decompressing them if necessary.

           Raises ReadError if the compressed data is invalid.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception:
                raise ReadError("invalid compressed data")
            self.dbuf += buf
            c += len(buf)
        buf = self.dbuf[:size]
        self.dbuf = self.dbuf[size:]
        return buf

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            self.buf += buf
            c += len(buf)
        buf = self.buf[:size]
        self.buf = self.buf[size:]
        return buf
# class _Stream
628
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').

       The first block of the wrapped file object is read eagerly so its
       signature can be inspected; the first read() call hands that block
       back to the caller.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        # Peeked first block; used for detection and returned by read().
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        # One-shot: rebind read on the instance to the wrapped object's
        # read, then return the block consumed in __init__ (size ignored).
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Return "gz", "bz2", "xz" or "tar" based on the peeked block."""
        head = self.buf
        if head.startswith(b"\x1f\x8b\x08"):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy
655
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000656#------------------------
657# Extraction file object
658#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.

       blockinfo, if given, is a sequence of (offset, size) pairs naming
       the regions of the logical file that are stored in fileobj; the
       gaps between them read back as NUL bytes.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0

        if blockinfo is None:
            # No block list: the whole file is one stored block.
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.  Entries are
        # (is_data, start, stop, position in fileobj or None).
        self.map_index = 0
        self.map = []
        covered = 0                 # logical end of the previous entry
        physical = self.offset      # where the next data block is stored
        for block_start, block_size in blockinfo:
            block_stop = block_start + block_size
            if block_start > covered:
                self.map.append((False, covered, block_start, None))
            self.map.append((True, block_start, block_stop, physical))
            physical += block_size
            covered = block_stop
        if covered < self.size:
            self.map.append((False, covered, self.size, None))

    def seekable(self):
        if hasattr(self.fileobj, "seekable"):
            return self.fileobj.seekable()
        # XXX gzip.GzipFile and bz2.BZ2File
        return True

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position):
        """Seek to a position in the file.
        """
        self.position = position

    def read(self, size=None):
        """Read data from the file.

        At most size bytes are returned, and never more than what is left
        up to the end of the file; size=None reads everything that is left.
        """
        remaining = self.size - self.position
        if size is not None:
            remaining = min(size, remaining)

        parts = []
        while remaining > 0:
            # Find the map entry containing the current position, starting
            # at the last used entry and wrapping around at the end.
            while True:
                is_data, start, stop, stored_at = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                self.map_index = (self.map_index + 1) % len(self.map)

            length = min(remaining, stop - self.position)
            if is_data:
                self.fileobj.seek(stored_at + (self.position - start))
                parts.append(self.fileobj.read(length))
            else:
                parts.append(NUL * length)
            remaining -= length
            self.position += length
        return b"".join(parts)
#class _FileInFile
732
733
734class ExFileObject(object):
735 """File-like object for reading an archive member.
736 Is returned by TarFile.extractfile().
737 """
738 blocksize = 1024
739
740 def __init__(self, tarfile, tarinfo):
741 self.fileobj = _FileInFile(tarfile.fileobj,
742 tarinfo.offset_data,
743 tarinfo.size,
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000744 tarinfo.sparse)
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000745 self.name = tarinfo.name
746 self.mode = "r"
747 self.closed = False
748 self.size = tarinfo.size
749
750 self.position = 0
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000751 self.buffer = b""
752
753 def readable(self):
754 return True
755
756 def writable(self):
757 return False
758
759 def seekable(self):
760 return self.fileobj.seekable()
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000761
762 def read(self, size=None):
763 """Read at most size bytes from the file. If size is not
764 present or None, read all data until EOF is reached.
765 """
766 if self.closed:
767 raise ValueError("I/O operation on closed file")
768
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000769 buf = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000770 if self.buffer:
771 if size is None:
772 buf = self.buffer
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000773 self.buffer = b""
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000774 else:
775 buf = self.buffer[:size]
776 self.buffer = self.buffer[size:]
777
778 if size is None:
779 buf += self.fileobj.read()
780 else:
781 buf += self.fileobj.read(size - len(buf))
782
783 self.position += len(buf)
784 return buf
785
Lars Gustäbelb506dc32007-08-07 18:36:16 +0000786 # XXX TextIOWrapper uses the read1() method.
787 read1 = read
788
def readline(self, size=-1):
    """Read one entire line from the file. If size is present
       and non-negative, return a string with at most that
       size, which may be an incomplete line. A size of None or
       any negative value means "no limit".

       Raises ValueError if the file object has been closed.
    """
    if self.closed:
        raise ValueError("I/O operation on closed file")

    pos = self.buffer.find(b"\n") + 1
    if pos == 0:
        # No newline buffered yet: keep reading blocks until one
        # contains a newline or the data is exhausted.
        while True:
            buf = self.fileobj.read(self.blocksize)
            self.buffer += buf
            if not buf or b"\n" in buf:
                pos = self.buffer.find(b"\n") + 1
                if pos == 0:
                    # EOF without newline: the rest is the last line.
                    pos = len(self.buffer)
                break

    # Only a real, non-negative limit may shorten the line. Feeding
    # None or an arbitrary negative number to min() would raise or
    # cut the buffer from the wrong end.
    if size is not None and size >= 0:
        pos = min(size, pos)

    buf = self.buffer[:pos]
    self.buffer = self.buffer[pos:]
    self.position += len(buf)
    return buf
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000817
def readlines(self):
    """Collect all lines from the current position up to EOF
       and return them as a list.
    """
    lines = []
    line = self.readline()
    while line:
        lines.append(line)
        line = self.readline()
    return lines
827
def tell(self):
    """Return the current position within the member's data.
       Raises ValueError if the file object has been closed.
    """
    if not self.closed:
        return self.position
    raise ValueError("I/O operation on closed file")
835
def seek(self, pos, whence=os.SEEK_SET):
    """Seek to a position in the file and return the new absolute
       position, as io.IOBase.seek() does.

       pos is interpreted relative to the start (os.SEEK_SET), the
       current position (os.SEEK_CUR) or the end (os.SEEK_END) of the
       member. The result is clamped to the range 0..self.size.
       Raises ValueError for a closed file or an unknown whence.
    """
    if self.closed:
        raise ValueError("I/O operation on closed file")

    if whence == os.SEEK_SET:
        target = pos
    elif whence == os.SEEK_CUR:
        target = self.position + pos
    elif whence == os.SEEK_END:
        target = self.size + pos
    else:
        raise ValueError("Invalid argument")

    # Never move before the first or past the last byte of the member.
    self.position = min(max(target, 0), self.size)

    # Buffered look-ahead data belongs to the old position.
    self.buffer = b""
    self.fileobj.seek(self.position)
    return self.position
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000856
def close(self):
    """Mark the file object as closed by setting its closed flag.
    """
    self.closed = True
Martin v. Löwisdf241532005-03-03 08:17:42 +0000861
862 def __iter__(self):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000863 """Get an iterator over the file's lines.
Martin v. Löwisdf241532005-03-03 08:17:42 +0000864 """
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000865 while True:
866 line = self.readline()
867 if not line:
868 break
869 yield line
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000870#class ExFileObject
871
872#------------------
873# Exported Classes
874#------------------
875class TarInfo(object):
876 """Informational class which holds the details about an
877 archive member given by a tar header block.
878 TarInfo objects are returned by TarFile.getmember(),
879 TarFile.getmembers() and TarFile.gettarinfo() and are
880 usually created internally.
881 """
882
Lars Gustäbelc2ea8c62008-04-14 10:05:48 +0000883 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
884 "chksum", "type", "linkname", "uname", "gname",
885 "devmajor", "devminor",
886 "offset", "offset_data", "pax_headers", "sparse",
887 "tarfile", "_sparse_structs", "_link_target")
888
def __init__(self, name=""):
    """Create a TarInfo object with default field values. name
       is the optional name of the member.
    """
    # Fields of the tar header block.
    self.name = name                    # member name
    self.mode = 0o644                   # file permissions
    self.uid = self.gid = 0             # user id / group id
    self.size = 0                       # file size
    self.mtime = 0                      # modification time
    self.chksum = 0                     # header checksum
    self.type = REGTYPE                 # member type
    self.linkname = ""                  # link name
    self.uname = self.gname = ""        # user name / group name
    self.devmajor = self.devminor = 0   # device major / minor number

    # Positions inside the archive.
    self.offset = 0                     # the tar header starts here
    self.offset_data = 0                # the file's data starts here

    # Supplemental information.
    self.sparse = None                  # sparse member information
    self.pax_headers = {}               # pax header information
912
913 # In pax headers the "name" and "linkname" field are called
914 # "path" and "linkpath".
def _getpath(self):
    """Getter of the "path" alias: return the member name."""
    return self.name

def _setpath(self, name):
    """Setter of the "path" alias: store name as the member name."""
    self.name = name

# "path" reads and writes the very same value as "name".
path = property(fget=_getpath, fset=_setpath)
920
def _getlinkpath(self):
    """Getter of the "linkpath" alias: return the link name."""
    return self.linkname

def _setlinkpath(self, linkname):
    """Setter of the "linkpath" alias: store linkname as the link name."""
    self.linkname = linkname

# "linkpath" reads and writes the very same value as "linkname".
linkpath = property(fget=_getlinkpath, fset=_setlinkpath)
926
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +0000927 def __repr__(self):
928 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
929
def get_info(self):
    """Return the header fields of this TarInfo as a new dictionary.
       The mode is masked with 0o7777 and the name of a directory
       member is given a trailing slash.
    """
    info = dict(
        name=self.name,
        mode=self.mode & 0o7777,
        uid=self.uid,
        gid=self.gid,
        size=self.size,
        mtime=self.mtime,
        chksum=self.chksum,
        type=self.type,
        linkname=self.linkname,
        uname=self.uname,
        gname=self.gname,
        devmajor=self.devmajor,
        devminor=self.devminor,
    )

    is_directory = info["type"] == DIRTYPE
    if is_directory and not info["name"].endswith("/"):
        info["name"] += "/"

    return info
953
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
    """Build the header for this member in the requested format
       and return it as bytes. Raises ValueError if format is not
       one of USTAR_FORMAT, GNU_FORMAT or PAX_FORMAT.
    """
    info = self.get_info()

    if format == USTAR_FORMAT:
        return self.create_ustar_header(info, encoding, errors)
    if format == GNU_FORMAT:
        return self.create_gnu_header(info, encoding, errors)
    if format == PAX_FORMAT:
        # The pax variant takes no error handler.
        return self.create_pax_header(info, encoding)
    raise ValueError("invalid format")
967
def create_ustar_header(self, info, encoding, errors):
    """Build a ustar header block from the info dictionary.
       Raises ValueError if the link name is longer than LENGTH_LINK
       or the name cannot be split into prefix and name.
    """
    info["magic"] = POSIX_MAGIC

    linkname = info["linkname"]
    if len(linkname) > LENGTH_LINK:
        raise ValueError("linkname is too long")

    name = info["name"]
    if len(name) > LENGTH_NAME:
        # Spread an overlong name over the prefix and name fields.
        prefix, shortname = self._posix_split_name(name)
        info["prefix"] = prefix
        info["name"] = shortname

    return self._create_header(info, USTAR_FORMAT, encoding, errors)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000980
def create_gnu_header(self, info, encoding, errors):
    """Build a GNU header block sequence from the info dictionary.
       An overlong link name or name is put into a separate long
       header that precedes the regular header block.
    """
    info["magic"] = GNU_MAGIC

    long_fields = (
        ("linkname", LENGTH_LINK, GNUTYPE_LONGLINK),
        ("name", LENGTH_NAME, GNUTYPE_LONGNAME),
    )

    blocks = []
    for field, limit, longtype in long_fields:
        if len(info[field]) > limit:
            blocks.append(self._create_gnu_long_header(
                info[field], longtype, encoding, errors))

    blocks.append(self._create_header(info, GNU_FORMAT, encoding, errors))
    return b"".join(blocks)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000994
def create_pax_header(self, info, encoding):
    """Build a ustar header block from the info dictionary. Fields
       that do not fit into it are moved to a pax extended header
       sequence which is put in front of the block.
    """
    info["magic"] = POSIX_MAGIC
    pax_headers = self.pax_headers.copy()

    # String fields: (info key, pax keyword, field length).
    string_fields = (
        ("name", "path", LENGTH_NAME),
        ("linkname", "linkpath", LENGTH_LINK),
        ("uname", "uname", 32),
        ("gname", "gname", 32),
    )
    for name, hname, length in string_fields:
        if hname in pax_headers:
            # The pax header has priority.
            continue

        text = info[name]
        try:
            text.encode("ascii", "strict")
        except UnicodeEncodeError:
            # Not representable in ASCII.
            needs_record = True
        else:
            # Representable, but possibly too long for the field.
            needs_record = len(text) > length

        if needs_record:
            pax_headers[hname] = text

    # Number fields: (info key, number of digits in the field).
    number_fields = (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12))
    for name, digits in number_fields:
        if name in pax_headers:
            # The pax header has priority. Avoid overflow.
            info[name] = 0
            continue

        val = info[name]
        fits = 0 <= val < 8 ** (digits - 1)
        if not fits or isinstance(val, float):
            # Out of range or a float: store it as a pax record.
            pax_headers[name] = str(val)
            info[name] = 0

    # Create a pax extended header if necessary.
    if pax_headers:
        extended = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
    else:
        extended = b""

    return extended + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001043
@classmethod
def create_pax_global_header(cls, pax_headers):
    """Build a pax global header block sequence (type XGLTYPE)
       from the pax_headers dictionary, encoded as UTF-8.
    """
    sequence = cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
    return sequence
Guido van Rossumd8faa362007-04-27 19:54:29 +00001049
def _posix_split_name(self, name):
    """Split an overlong name at a slash into a (prefix, name)
       tuple. Raises ValueError if no suitable split exists.
    """
    # Find the last slash within the first LENGTH_PREFIX + 1 characters;
    # cut is 0 if there is none.
    cut = name.rfind("/", 0, LENGTH_PREFIX + 1) + 1

    # The separating slash belongs to neither part.
    prefix = name[:cut][:-1]
    name = name[cut:]

    if not prefix or len(name) > LENGTH_NAME:
        raise ValueError("name is too long")
    return prefix, name
1064
@staticmethod
def _create_header(info, format, encoding, errors):
    """Return a header block. info is a dictionary with file
       information, format must be one of the *_FORMAT constants.
       Missing keys fall back to empty or zero defaults. String
       fields are encoded with encoding and errors.
    """
    parts = [
        stn(info.get("name", ""), 100, encoding, errors),
        itn(info.get("mode", 0) & 0o7777, 8, format),
        itn(info.get("uid", 0), 8, format),
        itn(info.get("gid", 0), 8, format),
        itn(info.get("size", 0), 12, format),
        itn(info.get("mtime", 0), 12, format),
        # Checksum field: an 8 byte placeholder of spaces at offset
        # 148. Its width must be exactly 8, otherwise all following
        # fields are shifted and the checksum patch below hits the
        # wrong bytes. It is spelled as a product so that the width
        # does not depend on whitespace inside a literal.
        b" " * 8,
        info.get("type", REGTYPE),
        stn(info.get("linkname", ""), 100, encoding, errors),
        info.get("magic", POSIX_MAGIC),
        stn(info.get("uname", ""), 32, encoding, errors),
        stn(info.get("gname", ""), 32, encoding, errors),
        itn(info.get("devmajor", 0), 8, format),
        itn(info.get("devminor", 0), 8, format),
        stn(info.get("prefix", ""), 155, encoding, errors)
    ]

    # Pad the joined fields with NUL bytes to a full block.
    buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
    chksum = calc_chksums(buf[-BLOCKSIZE:])[0]

    # Put six octal digits and a NUL into bytes 148..154 of the block
    # (-364 == 148 - 512, -357 == 155 - 512). Byte 155 keeps the space
    # of the placeholder.
    buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
    return buf
1092
@staticmethod
def _create_payload(payload):
    """Pad payload with NUL bytes so that its length becomes a
       multiple of BLOCKSIZE and return the result.
    """
    overhang = len(payload) % BLOCKSIZE
    if overhang > 0:
        padding = (BLOCKSIZE - overhang) * NUL
        payload += padding
    return payload
1102
@classmethod
def _create_gnu_long_header(cls, name, type, encoding, errors):
    """Build a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence for
       name: a header block followed by the padded, NUL terminated
       name.
    """
    encoded = name.encode(encoding, errors) + NUL

    info = {
        "name": "././@LongLink",
        "type": type,
        "size": len(encoded),
        "magic": GNU_MAGIC,
    }

    # create extended header + name blocks.
    header = cls._create_header(info, USTAR_FORMAT, encoding, errors)
    return header + cls._create_payload(encoded)
1119
@classmethod
def _create_pax_generic_header(cls, pax_headers, type, encoding):
    """Build a POSIX.1-2008 extended or global header sequence
       from the keyword, value pairs in pax_headers. The values
       must be strings.
    """
    # If a value cannot be encoded as strict UTF-8 (e.g. because it
    # contains surrogate characters), the whole header is written with
    # hdrcharset=BINARY, see _proc_pax() for more information.
    binary = False
    for value in pax_headers.values():
        try:
            value.encode("utf-8", "strict")
        except UnicodeEncodeError:
            binary = True
            break

    chunks = []
    if binary:
        # Put the hdrcharset field at the beginning of the header.
        chunks.append(b"21 hdrcharset=BINARY\n")

    for keyword, value in pax_headers.items():
        keyword = keyword.encode("utf-8")
        if binary:
            # Try to restore the original byte representation of `value'.
            # Needless to say, that the encoding must match the string.
            value = value.encode(encoding, "surrogateescape")
        else:
            value = value.encode("utf-8")

        # The record length counts its own decimal digits, so iterate
        # until the total no longer changes.
        l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
        n = p = 0
        while True:
            n = l + len(str(p))
            if n == p:
                break
            p = n
        chunks.append(bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n")

    records = b"".join(chunks)

    # We use a hardcoded "././@PaxHeader" name like star does
    # instead of the one that POSIX recommends.
    info = {
        "name": "././@PaxHeader",
        "type": type,
        "size": len(records),
        "magic": POSIX_MAGIC,
    }

    # Create pax header + record blocks.
    header = cls._create_header(info, USTAR_FORMAT, "ascii", "replace")
    return header + cls._create_payload(records)
1170
@classmethod
def frombuf(cls, buf, encoding, errors):
    """Parse a bytes object of BLOCKSIZE bytes into a new TarInfo
       object. Raises EmptyHeaderError, TruncatedHeaderError,
       EOFHeaderError or InvalidHeaderError if buf is not a usable
       header block.
    """
    size = len(buf)
    if size == 0:
        raise EmptyHeaderError("empty header")
    if size != BLOCKSIZE:
        raise TruncatedHeaderError("truncated header")
    if buf.count(NUL) == BLOCKSIZE:
        raise EOFHeaderError("end of file header")

    chksum = nti(buf[148:156])
    if chksum not in calc_chksums(buf):
        raise InvalidHeaderError("bad checksum")

    def text(start, stop):
        # Decode the string field stored in buf[start:stop].
        return nts(buf[start:stop], encoding, errors)

    def number(start, stop):
        # Decode the number field stored in buf[start:stop].
        return nti(buf[start:stop])

    obj = cls()
    obj.name = text(0, 100)
    obj.mode = number(100, 108)
    obj.uid = number(108, 116)
    obj.gid = number(116, 124)
    obj.size = number(124, 136)
    obj.mtime = number(136, 148)
    obj.chksum = chksum
    obj.type = buf[156:157]
    obj.linkname = text(157, 257)
    obj.uname = text(265, 297)
    obj.gname = text(297, 329)
    obj.devmajor = number(329, 337)
    obj.devminor = number(337, 345)
    prefix = text(345, 500)

    # Old V7 tar format represents a directory as a regular
    # file with a trailing slash.
    if obj.type == AREGTYPE and obj.name.endswith("/"):
        obj.type = DIRTYPE

    # The old GNU sparse format occupies some of the unused
    # space in the buffer for up to 4 sparse structures.
    # Save them for later processing in _proc_sparse().
    if obj.type == GNUTYPE_SPARSE:
        structs = []
        for pos in range(386, 386 + 4 * 24, 24):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            structs.append((offset, numbytes))
        isextended = bool(buf[482])
        origsize = nti(buf[483:495])
        obj._sparse_structs = (structs, isextended, origsize)

    # Remove redundant slashes from directories.
    if obj.isdir():
        obj.name = obj.name.rstrip("/")

    # Reconstruct a ustar longname.
    if prefix and obj.type not in GNU_TYPES:
        obj.name = prefix + "/" + obj.name
    return obj
1233
@classmethod
def fromtarfile(cls, tarfile):
    """Read the next header block from the TarFile object tarfile
       and return the TarInfo object that results from processing it.
    """
    fileobj = tarfile.fileobj
    block = fileobj.read(BLOCKSIZE)
    member = cls.frombuf(block, tarfile.encoding, tarfile.errors)
    # The header block has just been consumed, so it starts one block
    # before the current position.
    member.offset = fileobj.tell() - BLOCKSIZE
    return member._proc_member(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001243
Guido van Rossumd8faa362007-04-27 19:54:29 +00001244 #--------------------------------------------------------------------------
1245 # The following are methods that are called depending on the type of a
1246 # member. The entry point is _proc_member() which can be overridden in a
1247 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1248 # implement the following
1249 # operations:
1250 # 1. Set self.offset_data to the position where the data blocks begin,
1251 # if there is data that follows.
1252 # 2. Set tarfile.offset to the position where the next member's header will
1253 # begin.
1254 # 3. Return self or another valid TarInfo object.
def _proc_member(self, tarfile):
    """Dispatch to the _proc_*() method that handles the type of
       this member and return its result.
    """
    member_type = self.type
    if member_type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
        return self._proc_gnulong(tarfile)
    if member_type == GNUTYPE_SPARSE:
        return self._proc_sparse(tarfile)
    if member_type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
        return self._proc_pax(tarfile)
    # Everything else is handled like a builtin type.
    return self._proc_builtin(tarfile)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001267
def _proc_builtin(self, tarfile):
    """Process a member of a builtin type. A member of an unknown
       type is treated like a regular file.
    """
    data_start = tarfile.fileobj.tell()
    self.offset_data = data_start

    next_header = data_start
    if self.isreg() or self.type not in SUPPORTED_TYPES:
        # Skip the following data blocks.
        next_header += self._block(self.size)
    tarfile.offset = next_header

    # Patch the TarInfo object with saved global
    # header information.
    self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

    return self
1284
def _proc_gnulong(self, tarfile):
    """Process a GNU longname or longlink member: read the long
       string from the data blocks and attach it to the member
       that follows. Raises SubsequentHeaderError if that member's
       header is missing or bad.
    """
    longbuf = tarfile.fileobj.read(self._block(self.size))

    # Fetch the next header and process it.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Patch the TarInfo object from the next header with
    # the longname information.
    next.offset = self.offset
    longvalue_type = self.type
    if longvalue_type == GNUTYPE_LONGNAME:
        next.name = nts(longbuf, tarfile.encoding, tarfile.errors)
    elif longvalue_type == GNUTYPE_LONGLINK:
        next.linkname = nts(longbuf, tarfile.encoding, tarfile.errors)

    return next
1306
def _proc_sparse(self, tarfile):
    """Process an old-style GNU sparse header together with its
       extension blocks, if any.
    """
    # We already collected some sparse structures in frombuf().
    structs, isextended, origsize = self._sparse_structs
    del self._sparse_structs

    # Collect sparse structures from extended header blocks. Each
    # block holds up to 21 entries of 24 bytes and a continuation
    # flag at byte 504.
    while isextended:
        block = tarfile.fileobj.read(BLOCKSIZE)
        for pos in range(0, 21 * 24, 24):
            try:
                offset = nti(block[pos:pos + 12])
                numbytes = nti(block[pos + 12:pos + 24])
            except ValueError:
                break
            if offset and numbytes:
                structs.append((offset, numbytes))
        isextended = bool(block[504])
    self.sparse = structs

    # self.size still holds the size stored in the header here; it is
    # replaced by origsize only after the next offset is computed.
    self.offset_data = tarfile.fileobj.tell()
    tarfile.offset = self.offset_data + self._block(self.size)
    self.size = origsize
    return self
1334
def _proc_pax(self, tarfile):
    """Process an extended or global header as described in
       POSIX.1-2008 and return the TarInfo object of the member
       that follows it.

       Raises InvalidHeaderError if a record declares a length of
       zero and SubsequentHeaderError if the following header is
       missing or bad.
    """
    # Read the header information.
    buf = tarfile.fileobj.read(self._block(self.size))

    # A pax header stores supplemental information for either
    # the following file (extended) or all following files
    # (global).
    if self.type == XGLTYPE:
        pax_headers = tarfile.pax_headers
    else:
        pax_headers = tarfile.pax_headers.copy()

    # Check if the pax header contains a hdrcharset field. This tells us
    # the encoding of the path, linkpath, uname and gname fields. Normally,
    # these fields are UTF-8 encoded but since POSIX.1-2008 tar
    # implementations are allowed to store them as raw binary strings if
    # the translation to UTF-8 fails.
    match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
    if match is not None:
        pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

    # For the time being, we don't care about anything other than "BINARY".
    # The only other value that is currently allowed by the standard is
    # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
    hdrcharset = pax_headers.get("hdrcharset")
    if hdrcharset == "BINARY":
        encoding = tarfile.encoding
    else:
        encoding = "utf-8"

    # Parse pax header information. A record looks like that:
    # "%d %s=%s\n" % (length, keyword, value). length is the size
    # of the complete record including the length field itself and
    # the newline. keyword and value are both UTF-8 encoded strings.
    regex = re.compile(br"(\d+) ([^=]+)=")
    pos = 0
    while True:
        match = regex.match(buf, pos)
        if not match:
            break

        length, keyword = match.groups()
        length = int(length)
        if length == 0:
            # A record of length zero would leave pos unchanged and
            # make this loop spin forever on a crafted archive.
            raise InvalidHeaderError("invalid header")
        value = buf[match.end(2) + 1:match.start(1) + length - 1]

        # Normally, we could just use "utf-8" as the encoding and "strict"
        # as the error handler, but we better not take the risk. For
        # example, GNU tar <= 1.23 is known to store filenames it cannot
        # translate to UTF-8 as raw strings (unfortunately without a
        # hdrcharset=BINARY header).
        # We first try the strict standard encoding, and if that fails we
        # fall back on the user's encoding and error handler.
        keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                tarfile.errors)
        if keyword in PAX_NAME_FIELDS:
            value = self._decode_pax_field(value, encoding, tarfile.encoding,
                    tarfile.errors)
        else:
            value = self._decode_pax_field(value, "utf-8", "utf-8",
                    tarfile.errors)

        pax_headers[keyword] = value
        pos += length

    # Fetch the next header.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Process GNU sparse information.
    if "GNU.sparse.map" in pax_headers:
        # GNU extended sparse format version 0.1.
        self._proc_gnusparse_01(next, pax_headers)

    elif "GNU.sparse.size" in pax_headers:
        # GNU extended sparse format version 0.0.
        self._proc_gnusparse_00(next, pax_headers, buf)

    elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
        # GNU extended sparse format version 1.0.
        self._proc_gnusparse_10(next, pax_headers, tarfile)

    if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
        # Patch the TarInfo object with the extended header info.
        next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
        next.offset = self.offset

        if "size" in pax_headers:
            # If the extended header replaces the size field,
            # we need to recalculate the offset where the next
            # header starts.
            offset = next.offset_data
            if next.isreg() or next.type not in SUPPORTED_TYPES:
                offset += next._block(next.size)
            tarfile.offset = offset

    return next
1436
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001437 def _proc_gnusparse_00(self, next, pax_headers, buf):
1438 """Process a GNU tar extended sparse header, version 0.0.
1439 """
1440 offsets = []
1441 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1442 offsets.append(int(match.group(1)))
1443 numbytes = []
1444 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1445 numbytes.append(int(match.group(1)))
1446 next.sparse = list(zip(offsets, numbytes))
1447
1448 def _proc_gnusparse_01(self, next, pax_headers):
1449 """Process a GNU tar extended sparse header, version 0.1.
1450 """
1451 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1452 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1453
1454 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1455 """Process a GNU tar extended sparse header, version 1.0.
1456 """
1457 fields = None
1458 sparse = []
1459 buf = tarfile.fileobj.read(BLOCKSIZE)
1460 fields, buf = buf.split(b"\n", 1)
1461 fields = int(fields)
1462 while len(sparse) < fields * 2:
1463 if b"\n" not in buf:
1464 buf += tarfile.fileobj.read(BLOCKSIZE)
1465 number, buf = buf.split(b"\n", 1)
1466 sparse.append(int(number))
1467 next.offset_data = tarfile.fileobj.tell()
1468 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1469
Guido van Rossume7ba4952007-06-06 23:52:48 +00001470 def _apply_pax_info(self, pax_headers, encoding, errors):
1471 """Replace fields with supplemental information from a previous
1472 pax extended or global header.
1473 """
1474 for keyword, value in pax_headers.items():
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001475 if keyword == "GNU.sparse.name":
1476 setattr(self, "path", value)
1477 elif keyword == "GNU.sparse.size":
1478 setattr(self, "size", int(value))
1479 elif keyword == "GNU.sparse.realsize":
1480 setattr(self, "size", int(value))
1481 elif keyword in PAX_FIELDS:
1482 if keyword in PAX_NUMBER_FIELDS:
1483 try:
1484 value = PAX_NUMBER_FIELDS[keyword](value)
1485 except ValueError:
1486 value = 0
1487 if keyword == "path":
1488 value = value.rstrip("/")
1489 setattr(self, keyword, value)
Guido van Rossume7ba4952007-06-06 23:52:48 +00001490
1491 self.pax_headers = pax_headers.copy()
Guido van Rossumd8faa362007-04-27 19:54:29 +00001492
Lars Gustäbel1465cc22010-05-17 18:02:50 +00001493 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1494 """Decode a single field from a pax record.
1495 """
1496 try:
1497 return value.decode(encoding, "strict")
1498 except UnicodeDecodeError:
1499 return value.decode(fallback_encoding, fallback_errors)
1500
Guido van Rossumd8faa362007-04-27 19:54:29 +00001501 def _block(self, count):
1502 """Round up a byte count by BLOCKSIZE and return it,
1503 e.g. _block(834) => 1024.
1504 """
1505 blocks, remainder = divmod(count, BLOCKSIZE)
1506 if remainder:
1507 blocks += 1
1508 return blocks * BLOCKSIZE
Thomas Wouters89f507f2006-12-13 04:49:30 +00001509
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001510 def isreg(self):
1511 return self.type in REGULAR_TYPES
1512 def isfile(self):
1513 return self.isreg()
1514 def isdir(self):
1515 return self.type == DIRTYPE
1516 def issym(self):
1517 return self.type == SYMTYPE
1518 def islnk(self):
1519 return self.type == LNKTYPE
1520 def ischr(self):
1521 return self.type == CHRTYPE
1522 def isblk(self):
1523 return self.type == BLKTYPE
1524 def isfifo(self):
1525 return self.type == FIFOTYPE
1526 def issparse(self):
Lars Gustäbel9cbdd752010-10-29 09:08:19 +00001527 return self.sparse is not None
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001528 def isdev(self):
1529 return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1530# class TarInfo
1531
1532class TarFile(object):
1533 """The TarFile Class provides an interface to tar archives.
1534 """
1535
1536 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs)
1537
1538 dereference = False # If true, add content of linked file to the
1539 # tar file, else the link.
1540
1541 ignore_zeros = False # If true, skips empty or invalid blocks and
1542 # continues processing.
1543
Lars Gustäbel365aff32009-12-13 11:42:29 +00001544 errorlevel = 1 # If 0, fatal errors only appear in debug
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001545 # messages (if debug >= 0). If > 0, errors
1546 # are passed to the caller as exceptions.
1547
Guido van Rossumd8faa362007-04-27 19:54:29 +00001548 format = DEFAULT_FORMAT # The format to use when creating an archive.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001549
Guido van Rossume7ba4952007-06-06 23:52:48 +00001550 encoding = ENCODING # Encoding for 8-bit character strings.
1551
1552 errors = None # Error handler for unicode conversion.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001553
Guido van Rossumd8faa362007-04-27 19:54:29 +00001554 tarinfo = TarInfo # The default TarInfo class to use.
1555
1556 fileobject = ExFileObject # The default ExFileObject class to use.
1557
1558 def __init__(self, name=None, mode="r", fileobj=None, format=None,
1559 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
Victor Stinnerde629d42010-05-05 21:43:57 +00001560 errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001561 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
1562 read from an existing archive, 'a' to append data to an existing
1563 file or 'w' to create a new file overwriting an existing one. `mode'
1564 defaults to 'r'.
1565 If `fileobj' is given, it is used for reading or writing data. If it
1566 can be determined, `mode' is overridden by `fileobj's mode.
1567 `fileobj' is not closed, when TarFile is closed.
1568 """
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001569 if len(mode) > 1 or mode not in "raw":
Thomas Wouters477c8d52006-05-27 19:21:47 +00001570 raise ValueError("mode must be 'r', 'a' or 'w'")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001571 self.mode = mode
1572 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001573
1574 if not fileobj:
Guido van Rossumd8faa362007-04-27 19:54:29 +00001575 if self.mode == "a" and not os.path.exists(name):
Thomas Wouterscf297e42007-02-23 15:07:44 +00001576 # Create nonexistent files in append mode.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001577 self.mode = "w"
1578 self._mode = "wb"
Guido van Rossume7ba4952007-06-06 23:52:48 +00001579 fileobj = bltn_open(name, self._mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001580 self._extfileobj = False
1581 else:
Guido van Rossumd8faa362007-04-27 19:54:29 +00001582 if name is None and hasattr(fileobj, "name"):
1583 name = fileobj.name
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001584 if hasattr(fileobj, "mode"):
Guido van Rossumd8faa362007-04-27 19:54:29 +00001585 self._mode = fileobj.mode
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001586 self._extfileobj = True
Thomas Woutersed03b412007-08-28 21:37:11 +00001587 self.name = os.path.abspath(name) if name else None
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001588 self.fileobj = fileobj
1589
Guido van Rossumd8faa362007-04-27 19:54:29 +00001590 # Init attributes.
1591 if format is not None:
1592 self.format = format
1593 if tarinfo is not None:
1594 self.tarinfo = tarinfo
1595 if dereference is not None:
1596 self.dereference = dereference
1597 if ignore_zeros is not None:
1598 self.ignore_zeros = ignore_zeros
1599 if encoding is not None:
1600 self.encoding = encoding
Victor Stinnerde629d42010-05-05 21:43:57 +00001601 self.errors = errors
Guido van Rossume7ba4952007-06-06 23:52:48 +00001602
1603 if pax_headers is not None and self.format == PAX_FORMAT:
1604 self.pax_headers = pax_headers
1605 else:
1606 self.pax_headers = {}
1607
Guido van Rossumd8faa362007-04-27 19:54:29 +00001608 if debug is not None:
1609 self.debug = debug
1610 if errorlevel is not None:
1611 self.errorlevel = errorlevel
1612
1613 # Init datastructures.
Thomas Wouters477c8d52006-05-27 19:21:47 +00001614 self.closed = False
1615 self.members = [] # list of members as TarInfo objects
1616 self._loaded = False # flag if all members have been read
Christian Heimesd8654cf2007-12-02 15:22:16 +00001617 self.offset = self.fileobj.tell()
1618 # current position in the archive file
Thomas Wouters477c8d52006-05-27 19:21:47 +00001619 self.inodes = {} # dictionary caching the inodes of
1620 # archive members already added
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001621
Lars Gustäbel7b465392009-11-18 20:29:25 +00001622 try:
1623 if self.mode == "r":
1624 self.firstmember = None
1625 self.firstmember = self.next()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001626
Lars Gustäbel7b465392009-11-18 20:29:25 +00001627 if self.mode == "a":
1628 # Move to the end of the archive,
1629 # before the first empty block.
Lars Gustäbel7b465392009-11-18 20:29:25 +00001630 while True:
Lars Gustäbel9520a432009-11-22 18:48:49 +00001631 self.fileobj.seek(self.offset)
1632 try:
1633 tarinfo = self.tarinfo.fromtarfile(self)
1634 self.members.append(tarinfo)
1635 except EOFHeaderError:
1636 self.fileobj.seek(self.offset)
Lars Gustäbel7b465392009-11-18 20:29:25 +00001637 break
Lars Gustäbel9520a432009-11-22 18:48:49 +00001638 except HeaderError as e:
1639 raise ReadError(str(e))
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001640
Lars Gustäbel7b465392009-11-18 20:29:25 +00001641 if self.mode in "aw":
1642 self._loaded = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001643
Lars Gustäbel7b465392009-11-18 20:29:25 +00001644 if self.pax_headers:
1645 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
1646 self.fileobj.write(buf)
1647 self.offset += len(buf)
1648 except:
1649 if not self._extfileobj:
1650 self.fileobj.close()
1651 self.closed = True
1652 raise
Guido van Rossumd8faa362007-04-27 19:54:29 +00001653
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001654 #--------------------------------------------------------------------------
1655 # Below are the classmethods which act as alternate constructors to the
1656 # TarFile class. The open() method is the only one that is needed for
1657 # public use; it is the "super"-constructor and is able to select an
1658 # adequate "sub"-constructor for a particular compression using the mapping
1659 # from OPEN_METH.
1660 #
1661 # This concept allows one to subclass TarFile without losing the comfort of
1662 # the super-constructor. A sub-constructor is registered and made available
1663 # by adding it to the mapping in OPEN_METH.
1664
Guido van Rossum75b64e62005-01-16 00:16:11 +00001665 @classmethod
Guido van Rossumd8faa362007-04-27 19:54:29 +00001666 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001667 """Open a tar archive for reading, writing or appending. Return
1668 an appropriate TarFile class.
1669
1670 mode:
Martin v. Löwis78be7df2005-03-05 12:47:42 +00001671 'r' or 'r:*' open for reading with transparent compression
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001672 'r:' open for reading exclusively uncompressed
1673 'r:gz' open for reading with gzip compression
1674 'r:bz2' open for reading with bzip2 compression
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +01001675 'r:xz' open for reading with lzma compression
Thomas Wouterscf297e42007-02-23 15:07:44 +00001676 'a' or 'a:' open for appending, creating the file if necessary
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001677 'w' or 'w:' open for writing without compression
1678 'w:gz' open for writing with gzip compression
1679 'w:bz2' open for writing with bzip2 compression
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +01001680 'w:xz' open for writing with lzma compression
Martin v. Löwis78be7df2005-03-05 12:47:42 +00001681
1682 'r|*' open a stream of tar blocks with transparent compression
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001683 'r|' open an uncompressed stream of tar blocks for reading
1684 'r|gz' open a gzip compressed stream of tar blocks
1685 'r|bz2' open a bzip2 compressed stream of tar blocks
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +01001686 'r|xz' open an lzma compressed stream of tar blocks
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001687 'w|' open an uncompressed stream for writing
1688 'w|gz' open a gzip compressed stream for writing
1689 'w|bz2' open a bzip2 compressed stream for writing
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +01001690 'w|xz' open an lzma compressed stream for writing
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001691 """
1692
1693 if not name and not fileobj:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001694 raise ValueError("nothing to open")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001695
Martin v. Löwis78be7df2005-03-05 12:47:42 +00001696 if mode in ("r", "r:*"):
1697 # Find out which *open() is appropriate for opening the file.
1698 for comptype in cls.OPEN_METH:
1699 func = getattr(cls, cls.OPEN_METH[comptype])
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001700 if fileobj is not None:
1701 saved_pos = fileobj.tell()
Martin v. Löwis78be7df2005-03-05 12:47:42 +00001702 try:
Guido van Rossumd8faa362007-04-27 19:54:29 +00001703 return func(name, "r", fileobj, **kwargs)
1704 except (ReadError, CompressionError) as e:
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001705 if fileobj is not None:
1706 fileobj.seek(saved_pos)
Martin v. Löwis78be7df2005-03-05 12:47:42 +00001707 continue
Thomas Wouters477c8d52006-05-27 19:21:47 +00001708 raise ReadError("file could not be opened successfully")
Martin v. Löwis78be7df2005-03-05 12:47:42 +00001709
1710 elif ":" in mode:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001711 filemode, comptype = mode.split(":", 1)
1712 filemode = filemode or "r"
1713 comptype = comptype or "tar"
1714
1715 # Select the *open() function according to
1716 # given compression.
1717 if comptype in cls.OPEN_METH:
1718 func = getattr(cls, cls.OPEN_METH[comptype])
1719 else:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001720 raise CompressionError("unknown compression type %r" % comptype)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001721 return func(name, filemode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001722
1723 elif "|" in mode:
1724 filemode, comptype = mode.split("|", 1)
1725 filemode = filemode or "r"
1726 comptype = comptype or "tar"
1727
1728 if filemode not in "rw":
Thomas Wouters477c8d52006-05-27 19:21:47 +00001729 raise ValueError("mode must be 'r' or 'w'")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001730
Antoine Pitrou605c2932010-09-23 20:15:14 +00001731 stream = _Stream(name, filemode, comptype, fileobj, bufsize)
1732 try:
1733 t = cls(name, filemode, stream, **kwargs)
1734 except:
1735 stream.close()
1736 raise
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001737 t._extfileobj = False
1738 return t
1739
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001740 elif mode in "aw":
Guido van Rossumd8faa362007-04-27 19:54:29 +00001741 return cls.taropen(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001742
Thomas Wouters477c8d52006-05-27 19:21:47 +00001743 raise ValueError("undiscernible mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001744
Guido van Rossum75b64e62005-01-16 00:16:11 +00001745 @classmethod
Guido van Rossumd8faa362007-04-27 19:54:29 +00001746 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001747 """Open uncompressed tar archive name for reading or writing.
1748 """
1749 if len(mode) > 1 or mode not in "raw":
Thomas Wouters477c8d52006-05-27 19:21:47 +00001750 raise ValueError("mode must be 'r', 'a' or 'w'")
Guido van Rossumd8faa362007-04-27 19:54:29 +00001751 return cls(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001752
Guido van Rossum75b64e62005-01-16 00:16:11 +00001753 @classmethod
Guido van Rossumd8faa362007-04-27 19:54:29 +00001754 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001755 """Open gzip compressed tar archive name for reading or writing.
1756 Appending is not allowed.
1757 """
1758 if len(mode) > 1 or mode not in "rw":
Thomas Wouters477c8d52006-05-27 19:21:47 +00001759 raise ValueError("mode must be 'r' or 'w'")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001760
1761 try:
1762 import gzip
Neal Norwitz4ec68242003-04-11 03:05:56 +00001763 gzip.GzipFile
1764 except (ImportError, AttributeError):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001765 raise CompressionError("gzip module is not available")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001766
Antoine Pitroue1eca4e2010-10-29 23:49:49 +00001767 extfileobj = fileobj is not None
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001768 try:
Antoine Pitroue1eca4e2010-10-29 23:49:49 +00001769 fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
1770 t = cls.taropen(name, mode, fileobj, **kwargs)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001771 except IOError:
Georg Brandl3abb3722011-08-13 11:48:12 +02001772 if not extfileobj and fileobj is not None:
Antoine Pitrou95f55602010-09-23 18:36:46 +00001773 fileobj.close()
Georg Brandl3abb3722011-08-13 11:48:12 +02001774 if fileobj is None:
1775 raise
Thomas Wouters477c8d52006-05-27 19:21:47 +00001776 raise ReadError("not a gzip file")
Antoine Pitroue1eca4e2010-10-29 23:49:49 +00001777 except:
Georg Brandl3abb3722011-08-13 11:48:12 +02001778 if not extfileobj and fileobj is not None:
Antoine Pitroue1eca4e2010-10-29 23:49:49 +00001779 fileobj.close()
1780 raise
Antoine Pitrou95f55602010-09-23 18:36:46 +00001781 t._extfileobj = extfileobj
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001782 return t
1783
Guido van Rossum75b64e62005-01-16 00:16:11 +00001784 @classmethod
Guido van Rossumd8faa362007-04-27 19:54:29 +00001785 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001786 """Open bzip2 compressed tar archive name for reading or writing.
1787 Appending is not allowed.
1788 """
1789 if len(mode) > 1 or mode not in "rw":
Thomas Wouters477c8d52006-05-27 19:21:47 +00001790 raise ValueError("mode must be 'r' or 'w'.")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001791
1792 try:
1793 import bz2
1794 except ImportError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001795 raise CompressionError("bz2 module is not available")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001796
Lars Gustäbelbb44b732011-12-06 13:44:10 +01001797 fileobj = bz2.BZ2File(filename=name if fileobj is None else None,
1798 mode=mode, fileobj=fileobj, compresslevel=compresslevel)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001799
1800 try:
Guido van Rossumd8faa362007-04-27 19:54:29 +00001801 t = cls.taropen(name, mode, fileobj, **kwargs)
Lars Gustäbel9520a432009-11-22 18:48:49 +00001802 except (IOError, EOFError):
Antoine Pitrou95f55602010-09-23 18:36:46 +00001803 fileobj.close()
Thomas Wouters477c8d52006-05-27 19:21:47 +00001804 raise ReadError("not a bzip2 file")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001805 t._extfileobj = False
1806 return t
1807
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +01001808 @classmethod
1809 def xzopen(cls, name, mode="r", fileobj=None, preset=9, **kwargs):
1810 """Open lzma compressed tar archive name for reading or writing.
1811 Appending is not allowed.
1812 """
1813 if mode not in ("r", "w"):
1814 raise ValueError("mode must be 'r' or 'w'")
1815
1816 try:
1817 import lzma
1818 except ImportError:
1819 raise CompressionError("lzma module is not available")
1820
1821 if mode == "r":
1822 # LZMAFile complains about a preset argument in read mode.
1823 preset = None
1824
1825 fileobj = lzma.LZMAFile(filename=name if fileobj is None else None,
1826 mode=mode, fileobj=fileobj, preset=preset)
1827
1828 try:
1829 t = cls.taropen(name, mode, fileobj, **kwargs)
1830 except (lzma.LZMAError, EOFError):
1831 fileobj.close()
1832 raise ReadError("not an lzma file")
1833 t._extfileobj = False
1834 return t
1835
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001836 # All *open() methods are registered here.
1837 OPEN_METH = {
1838 "tar": "taropen", # uncompressed tar
1839 "gz": "gzopen", # gzip compressed tar
Lars Gustäbel0a9dd2f2011-12-10 20:38:14 +01001840 "bz2": "bz2open", # bzip2 compressed tar
1841 "xz": "xzopen" # lzma compressed tar
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001842 }
1843
1844 #--------------------------------------------------------------------------
1845 # The public methods which TarFile provides:
1846
1847 def close(self):
1848 """Close the TarFile. In write-mode, two finishing zero blocks are
1849 appended to the archive.
1850 """
1851 if self.closed:
1852 return
1853
Guido van Rossumd8faa362007-04-27 19:54:29 +00001854 if self.mode in "aw":
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001855 self.fileobj.write(NUL * (BLOCKSIZE * 2))
1856 self.offset += (BLOCKSIZE * 2)
1857 # fill up the end with zero-blocks
1858 # (like option -b20 for tar does)
1859 blocks, remainder = divmod(self.offset, RECORDSIZE)
1860 if remainder > 0:
1861 self.fileobj.write(NUL * (RECORDSIZE - remainder))
1862
1863 if not self._extfileobj:
1864 self.fileobj.close()
1865 self.closed = True
1866
1867 def getmember(self, name):
1868 """Return a TarInfo object for member `name'. If `name' can not be
1869 found in the archive, KeyError is raised. If a member occurs more
Mark Dickinson934896d2009-02-21 20:59:32 +00001870 than once in the archive, its last occurrence is assumed to be the
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001871 most up-to-date version.
1872 """
Martin v. Löwisf3c56112004-09-18 09:08:52 +00001873 tarinfo = self._getmember(name)
1874 if tarinfo is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001875 raise KeyError("filename %r not found" % name)
Martin v. Löwisf3c56112004-09-18 09:08:52 +00001876 return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001877
1878 def getmembers(self):
1879 """Return the members of the archive as a list of TarInfo objects. The
1880 list has the same order as the members in the archive.
1881 """
1882 self._check()
1883 if not self._loaded: # if we want to obtain a list of
1884 self._load() # all members, we first have to
1885 # scan the whole archive.
1886 return self.members
1887
1888 def getnames(self):
1889 """Return the members of the archive as a list of their names. It has
1890 the same order as the list returned by getmembers().
1891 """
Martin v. Löwisf3c56112004-09-18 09:08:52 +00001892 return [tarinfo.name for tarinfo in self.getmembers()]
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001893
1894 def gettarinfo(self, name=None, arcname=None, fileobj=None):
1895 """Create a TarInfo object for either the file `name' or the file
1896 object `fileobj' (using os.fstat on its file descriptor). You can
1897 modify some of the TarInfo's attributes before you add it using
1898 addfile(). If given, `arcname' specifies an alternative name for the
1899 file in the archive.
1900 """
1901 self._check("aw")
1902
1903 # When fileobj is given, replace name by
1904 # fileobj's real name.
1905 if fileobj is not None:
1906 name = fileobj.name
1907
1908 # Building the name of the member in the archive.
1909 # Backward slashes are converted to forward slashes,
1910 # Absolute paths are turned to relative paths.
1911 if arcname is None:
1912 arcname = name
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001913 drv, arcname = os.path.splitdrive(arcname)
Lars Gustäbelbfdfdda2009-08-28 19:59:59 +00001914 arcname = arcname.replace(os.sep, "/")
1915 arcname = arcname.lstrip("/")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001916
1917 # Now, fill the TarInfo object with
1918 # information specific for the file.
Guido van Rossumd8faa362007-04-27 19:54:29 +00001919 tarinfo = self.tarinfo()
1920 tarinfo.tarfile = self
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001921
1922 # Use os.stat or os.lstat, depending on platform
1923 # and if symlinks shall be resolved.
1924 if fileobj is None:
1925 if hasattr(os, "lstat") and not self.dereference:
1926 statres = os.lstat(name)
1927 else:
1928 statres = os.stat(name)
1929 else:
1930 statres = os.fstat(fileobj.fileno())
1931 linkname = ""
1932
1933 stmd = statres.st_mode
1934 if stat.S_ISREG(stmd):
1935 inode = (statres.st_ino, statres.st_dev)
Guido van Rossumd8faa362007-04-27 19:54:29 +00001936 if not self.dereference and statres.st_nlink > 1 and \
1937 inode in self.inodes and arcname != self.inodes[inode]:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001938 # Is it a hardlink to an already
1939 # archived file?
1940 type = LNKTYPE
1941 linkname = self.inodes[inode]
1942 else:
1943 # The inode is added only if its valid.
1944 # For win32 it is always 0.
1945 type = REGTYPE
1946 if inode[0]:
1947 self.inodes[inode] = arcname
1948 elif stat.S_ISDIR(stmd):
1949 type = DIRTYPE
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001950 elif stat.S_ISFIFO(stmd):
1951 type = FIFOTYPE
1952 elif stat.S_ISLNK(stmd):
1953 type = SYMTYPE
1954 linkname = os.readlink(name)
1955 elif stat.S_ISCHR(stmd):
1956 type = CHRTYPE
1957 elif stat.S_ISBLK(stmd):
1958 type = BLKTYPE
1959 else:
1960 return None
1961
1962 # Fill the TarInfo object with all
1963 # information we can get.
Martin v. Löwis5dbdc592005-08-27 10:07:56 +00001964 tarinfo.name = arcname
1965 tarinfo.mode = stmd
1966 tarinfo.uid = statres.st_uid
1967 tarinfo.gid = statres.st_gid
Lars Gustäbel2470ff12010-06-03 10:11:52 +00001968 if type == REGTYPE:
Martin v. Löwis61d77e02004-08-20 06:35:46 +00001969 tarinfo.size = statres.st_size
Martin v. Löwis5dbdc592005-08-27 10:07:56 +00001970 else:
Guido van Rossume2a383d2007-01-15 16:59:06 +00001971 tarinfo.size = 0
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001972 tarinfo.mtime = statres.st_mtime
Martin v. Löwis5dbdc592005-08-27 10:07:56 +00001973 tarinfo.type = type
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00001974 tarinfo.linkname = linkname
1975 if pwd:
1976 try:
1977 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
1978 except KeyError:
1979 pass
1980 if grp:
1981 try:
1982 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
1983 except KeyError:
1984 pass
1985
1986 if type in (CHRTYPE, BLKTYPE):
1987 if hasattr(os, "major") and hasattr(os, "minor"):
1988 tarinfo.devmajor = os.major(statres.st_rdev)
1989 tarinfo.devminor = os.minor(statres.st_rdev)
1990 return tarinfo
1991
1992 def list(self, verbose=True):
1993 """Print a table of contents to sys.stdout. If `verbose' is False, only
1994 the names of the members are printed. If it is True, an `ls -l'-like
1995 output is produced.
1996 """
1997 self._check()
1998
1999 for tarinfo in self:
2000 if verbose:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00002001 print(filemode(tarinfo.mode), end=' ')
2002 print("%s/%s" % (tarinfo.uname or tarinfo.uid,
2003 tarinfo.gname or tarinfo.gid), end=' ')
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002004 if tarinfo.ischr() or tarinfo.isblk():
Guido van Rossumbe19ed72007-02-09 05:37:30 +00002005 print("%10s" % ("%d,%d" \
2006 % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002007 else:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00002008 print("%10d" % tarinfo.size, end=' ')
2009 print("%d-%02d-%02d %02d:%02d:%02d" \
2010 % time.localtime(tarinfo.mtime)[:6], end=' ')
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002011
Guido van Rossumd8faa362007-04-27 19:54:29 +00002012 print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002013
2014 if verbose:
2015 if tarinfo.issym():
Guido van Rossumbe19ed72007-02-09 05:37:30 +00002016 print("->", tarinfo.linkname, end=' ')
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002017 if tarinfo.islnk():
Guido van Rossumbe19ed72007-02-09 05:37:30 +00002018 print("link to", tarinfo.linkname, end=' ')
2019 print()
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002020
Raymond Hettingera63a3122011-01-26 20:34:14 +00002021 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002022 """Add the file `name' to the archive. `name' may be any type of file
2023 (directory, fifo, symbolic link, etc.). If given, `arcname'
2024 specifies an alternative name for the file in the archive.
2025 Directories are added recursively by default. This can be avoided by
Guido van Rossum486364b2007-06-30 05:01:58 +00002026 setting `recursive' to False. `exclude' is a function that should
Lars Gustäbel049d2aa2009-09-12 10:44:00 +00002027 return True for each filename to be excluded. `filter' is a function
2028 that expects a TarInfo object argument and returns the changed
2029 TarInfo object, if it returns None the TarInfo object will be
2030 excluded from the archive.
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002031 """
2032 self._check("aw")
2033
2034 if arcname is None:
2035 arcname = name
2036
Guido van Rossum486364b2007-06-30 05:01:58 +00002037 # Exclude pathnames.
Lars Gustäbel049d2aa2009-09-12 10:44:00 +00002038 if exclude is not None:
2039 import warnings
2040 warnings.warn("use the filter argument instead",
2041 DeprecationWarning, 2)
2042 if exclude(name):
2043 self._dbg(2, "tarfile: Excluded %r" % name)
2044 return
Guido van Rossum486364b2007-06-30 05:01:58 +00002045
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002046 # Skip if somebody tries to archive the archive...
Thomas Wouters902d6eb2007-01-09 23:18:33 +00002047 if self.name is not None and os.path.abspath(name) == self.name:
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002048 self._dbg(2, "tarfile: Skipped %r" % name)
2049 return
2050
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002051 self._dbg(1, name)
2052
2053 # Create a TarInfo object from the file.
2054 tarinfo = self.gettarinfo(name, arcname)
2055
2056 if tarinfo is None:
2057 self._dbg(1, "tarfile: Unsupported type %r" % name)
2058 return
2059
Lars Gustäbel049d2aa2009-09-12 10:44:00 +00002060 # Change or exclude the TarInfo object.
2061 if filter is not None:
2062 tarinfo = filter(tarinfo)
2063 if tarinfo is None:
2064 self._dbg(2, "tarfile: Excluded %r" % name)
2065 return
2066
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002067 # Append the tar header and data to the archive.
2068 if tarinfo.isreg():
Guido van Rossume7ba4952007-06-06 23:52:48 +00002069 f = bltn_open(name, "rb")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002070 self.addfile(tarinfo, f)
2071 f.close()
2072
Martin v. Löwis5dbdc592005-08-27 10:07:56 +00002073 elif tarinfo.isdir():
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002074 self.addfile(tarinfo)
2075 if recursive:
2076 for f in os.listdir(name):
Lars Gustäbel049d2aa2009-09-12 10:44:00 +00002077 self.add(os.path.join(name, f), os.path.join(arcname, f),
Raymond Hettingera63a3122011-01-26 20:34:14 +00002078 recursive, exclude, filter=filter)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002079
Martin v. Löwis5dbdc592005-08-27 10:07:56 +00002080 else:
2081 self.addfile(tarinfo)
2082
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002083 def addfile(self, tarinfo, fileobj=None):
2084 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
2085 given, tarinfo.size bytes are read from it and added to the archive.
2086 You can create TarInfo objects using gettarinfo().
2087 On Windows platforms, `fileobj' should always be opened with mode
2088 'rb' to avoid irritation about the file size.
2089 """
2090 self._check("aw")
2091
Thomas Wouters89f507f2006-12-13 04:49:30 +00002092 tarinfo = copy.copy(tarinfo)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002093
Guido van Rossume7ba4952007-06-06 23:52:48 +00002094 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002095 self.fileobj.write(buf)
2096 self.offset += len(buf)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002097
2098 # If there's data to follow, append it.
2099 if fileobj is not None:
2100 copyfileobj(fileobj, self.fileobj, tarinfo.size)
2101 blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
2102 if remainder > 0:
2103 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2104 blocks += 1
2105 self.offset += blocks * BLOCKSIZE
2106
Martin v. Löwisf3c56112004-09-18 09:08:52 +00002107 self.members.append(tarinfo)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002108
def extractall(self, path=".", members=None):
    """Extract all members from the archive to the current working
    directory. `path' specifies a different directory to extract to.
    `members' is optional and must be a subset of the list returned
    by getmembers().

    Directories are first created with a safe mode; their owner,
    modification time and permissions are set once every member has
    been extracted.
    """
    if members is None:
        members = self

    # Directories whose real attributes are applied at the very end.
    deferred = []

    for tarinfo in members:
        if tarinfo.isdir():
            # Extract directories with a safe mode. The original object
            # is remembered, the copy carries the temporary mode.
            deferred.append(tarinfo)
            tarinfo = copy.copy(tarinfo)
            tarinfo.mode = 0o700
        # Do not set_attrs directories, as we will do that further down
        self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())

    # Walk the directories in reverse name order (a stable ascending
    # sort that is then reversed).
    for tarinfo in reversed(sorted(deferred, key=lambda entry: entry.name)):
        dirpath = os.path.join(path, tarinfo.name)
        try:
            # Set correct owner, mtime and filemode on the directory.
            self.chown(tarinfo, dirpath)
            self.utime(tarinfo, dirpath)
            self.chmod(tarinfo, dirpath)
        except ExtractError as exc:
            if self.errorlevel > 1:
                raise
            self._dbg(1, "tarfile: %s" % exc)
2146
def extract(self, member, path="", set_attrs=True):
    """Extract a single member from the archive to the current working
    directory, using its full name. `member' may be a filename or a
    TarInfo object; `path' selects a different target directory. File
    attributes (owner, mtime, mode) are set unless `set_attrs' is False.

    Errors are re-raised or only reported through _dbg(), depending on
    self.errorlevel.
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    if tarinfo.islnk():
        # Prepare the link target for makelink().
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    try:
        destination = os.path.join(path, tarinfo.name)
        self._extract_member(tarinfo, destination, set_attrs=set_attrs)
    except EnvironmentError as exc:
        if self.errorlevel > 0:
            raise
        if exc.filename is None:
            self._dbg(1, "tarfile: %s" % exc.strerror)
        else:
            self._dbg(1, "tarfile: %s %r" % (exc.strerror, exc.filename))
    except ExtractError as exc:
        if self.errorlevel > 1:
            raise
        self._dbg(1, "tarfile: %s" % exc)
2181
def extractfile(self, member):
    """Extract a member from the archive as a file object. `member' may
    be a filename or a TarInfo object. For a regular file a file-like
    object is returned; for a link the file-like object of the link's
    target is returned. For anything else None is returned.
    The file-like object is read-only and provides the following
    methods: read(), readline(), readlines(), seek() and tell()
    """
    self._check("r")

    tarinfo = self.getmember(member) if isinstance(member, str) else member

    # Regular files are served directly, and so are members of unknown
    # type, which are treated as regular files.
    if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
        return self.fileobject(self, tarinfo)

    if not (tarinfo.islnk() or tarinfo.issym()):
        # If there's no data associated with the member (directory,
        # chrdev, blkdev, etc.), return None instead of a file object.
        return None

    if isinstance(self.fileobj, _Stream):
        # A small but ugly workaround for the case that someone tries
        # to extract a (sym)link as a file-object from a non-seekable
        # stream of tar blocks.
        raise StreamError("cannot extract (sym)link as file object")

    # A (sym)link's file object is its target's file object.
    return self.extractfile(self._find_link_target(tarinfo))
2219
def _extract_member(self, tarinfo, targetpath, set_attrs=True):
    """Extract the TarInfo object tarinfo to a physical
    file called targetpath. Owner, mode and modification time are
    applied afterwards unless set_attrs is false.
    """
    # Build the destination pathname: drop trailing slashes and replace
    # forward slashes with the platform specific separator.
    targetpath = targetpath.rstrip("/").replace("/", os.sep)

    # Create all upper directories. Directories that are not part of
    # the archive get default permissions.
    parent = os.path.dirname(targetpath)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    if tarinfo.islnk() or tarinfo.issym():
        message = "%s -> %s" % (tarinfo.name, tarinfo.linkname)
    else:
        message = tarinfo.name
    self._dbg(1, message)

    # Pick the make*() method matching the member type.
    if tarinfo.isreg():
        make = self.makefile
    elif tarinfo.isdir():
        make = self.makedir
    elif tarinfo.isfifo():
        make = self.makefifo
    elif tarinfo.ischr() or tarinfo.isblk():
        make = self.makedev
    elif tarinfo.islnk() or tarinfo.issym():
        make = self.makelink
    elif tarinfo.type not in SUPPORTED_TYPES:
        make = self.makeunknown
    else:
        make = self.makefile
    make(tarinfo, targetpath)

    if not set_attrs:
        return

    self.chown(tarinfo, targetpath)
    if not tarinfo.issym():
        # Mode and mtime are not applied to symbolic links.
        self.chmod(tarinfo, targetpath)
        self.utime(tarinfo, targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002262
2263 #--------------------------------------------------------------------------
2264 # Below are the different file methods. They are called via
2265 # _extract_member() when extract() is called. They can be replaced in a
2266 # subclass to implement other functionality.
2267
def makedir(self, tarinfo, targetpath):
    """Make a directory called targetpath.

    An already existing directory is left in place. If targetpath
    exists but is not a directory, the FileExistsError raised by
    os.mkdir() is propagated instead of being silently ignored.
    """
    try:
        # Use a safe mode for the directory, the real mode is set
        # later in _extract_member().
        os.mkdir(targetpath, 0o700)
    except FileExistsError:
        # Only an existing directory makes the error harmless.
        if not os.path.isdir(targetpath):
            raise
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002277
def makefile(self, tarinfo, targetpath):
    """Make a file called targetpath.

    The member's data is copied from the archive, starting at
    tarinfo.offset_data. For a sparse member only the segments listed
    in tarinfo.sparse are written, each at its own offset. Finally the
    file is truncated at position tarinfo.size.
    """
    source = self.fileobj
    source.seek(tarinfo.offset_data)
    # The with statement closes the target file even if seeking or
    # copying raises, so the handle is not leaked.
    with bltn_open(targetpath, "wb") as target:
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
        else:
            copyfileobj(source, target, tarinfo.size)
        target.seek(tarinfo.size)
        target.truncate()
2293
def makeunknown(self, tarinfo, targetpath):
    """Make a file at targetpath from a TarInfo object whose type is
    unknown. The member is written with makefile() and a debug
    message is emitted.
    """
    self.makefile(tarinfo, targetpath)
    notice = ("tarfile: Unknown file type %r, "
              "extracted as regular file." % tarinfo.type)
    self._dbg(1, notice)
2301
def makefifo(self, tarinfo, targetpath):
    """Make a fifo called targetpath. Raise ExtractError if the
    platform provides no os.mkfifo().
    """
    if not hasattr(os, "mkfifo"):
        raise ExtractError("fifo not supported by system")
    os.mkfifo(targetpath)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002309
def makedev(self, tarinfo, targetpath):
    """Make a character or block device called targetpath. Raise
    ExtractError if os.mknod() or os.makedev() is not available.
    """
    if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
        raise ExtractError("special devices not supported by system")

    # Combine the permission bits with the matching device type flag.
    mode = tarinfo.mode | (stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR)

    device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
    os.mknod(targetpath, mode, device)
2324
def makelink(self, tarinfo, targetpath):
    """Make a (symbolic) link called targetpath. If it cannot be created
    (platform limitation), we try to make a copy of the referenced file
    instead of a link.

    Raises ExtractError if the fallback copy is needed but the link
    target cannot be found inside the archive.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        elif os.path.exists(tarinfo._link_target):
            # _link_target is prepared by extract().
            os.link(tarinfo._link_target, targetpath)
        else:
            # The hard link's target is not present on disk, so the
            # member it refers to is extracted under the link's name.
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
    except symlink_exception:
        # Links are not available: fall back to a copy of the target.
        # This must happen only on failure; running it after a
        # successful os.symlink()/os.link() would overwrite the link
        # that was just created.
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002353
def chown(self, tarinfo, targetpath):
    """Set owner of targetpath according to tarinfo. Nothing is done
    unless the process runs with effective uid 0. Raise ExtractError
    if the owner cannot be changed.
    """
    # We have to be root to do so.
    if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
        return

    # Prefer the ids that the stored names map to on this system and
    # fall back to the numeric ids from the archive.
    try:
        gid = grp.getgrnam(tarinfo.gname)[2]
    except KeyError:
        gid = tarinfo.gid
    try:
        uid = pwd.getpwnam(tarinfo.uname)[2]
    except KeyError:
        uid = tarinfo.uid

    try:
        if tarinfo.issym() and hasattr(os, "lchown"):
            os.lchown(targetpath, uid, gid)
        elif sys.platform != "os2emx":
            os.chown(targetpath, uid, gid)
    except EnvironmentError:
        raise ExtractError("could not change owner")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002375
def chmod(self, tarinfo, targetpath):
    """Set file permissions of targetpath according to tarinfo. Raise
    ExtractError if the mode cannot be changed.
    """
    if not hasattr(os, 'chmod'):
        return
    try:
        os.chmod(targetpath, tarinfo.mode)
    except EnvironmentError:
        raise ExtractError("could not change mode")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002384
def utime(self, tarinfo, targetpath):
    """Set modification time of targetpath according to tarinfo. The
    access time is set to the same value. Raise ExtractError if the
    time cannot be changed.
    """
    if hasattr(os, 'utime'):
        try:
            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
        except EnvironmentError:
            raise ExtractError("could not change modification time")
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002394
2395 #--------------------------------------------------------------------------
def next(self):
    """Return the next member of the archive as a TarInfo object, when
    TarFile is opened for reading. Return None if there is no more
    available.
    """
    self._check("ra")

    # A pending first member is handed out once before anything else
    # is read.
    pending = self.firstmember
    if pending is not None:
        self.firstmember = None
        return pending

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        # Every handler below either raises, leaves the loop, or skips
        # one block and falls through to another attempt.
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as exc:
            if not self.ignore_zeros:
                break
            self._dbg(2, "0x%X: %s" % (self.offset, exc))
            self.offset += BLOCKSIZE
        except InvalidHeaderError as exc:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, exc))
                self.offset += BLOCKSIZE
            elif self.offset == 0:
                raise ReadError(str(exc))
            else:
                break
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
            break
        except TruncatedHeaderError as exc:
            if self.offset == 0:
                raise ReadError(str(exc))
            break
        except SubsequentHeaderError as exc:
            raise ReadError(str(exc))
        else:
            break

    if tarinfo is None:
        # Nothing more could be read: the member list is complete.
        self._loaded = True
    else:
        self.members.append(tarinfo)

    return tarinfo
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002441
2442 #--------------------------------------------------------------------------
2443 # Little helper methods:
2444
def _getmember(self, name, tarinfo=None, normalize=False):
    """Find an archive member by name, searching from bottom to top.
    If tarinfo is given, only the members before it are searched.
    If normalize is true, names are compared after os.path.normpath().
    Return None if no member matches.
    """
    # Ensure that all members have been loaded.
    candidates = self.getmembers()

    # Limit the member search list up to tarinfo.
    if tarinfo is not None:
        candidates = candidates[:candidates.index(tarinfo)]

    wanted = os.path.normpath(name) if normalize else name

    for member in reversed(candidates):
        current = os.path.normpath(member.name) if normalize else member.name
        if wanted == current:
            return member

    return None
Andrew M. Kuchling864bba12004-07-10 22:02:11 +00002467
def _load(self):
    """Read through the entire archive file by calling next() until it
    returns None, then mark the TarFile as loaded.
    """
    while self.next() is not None:
        pass
    self._loaded = True
2477
def _check(self, mode=None):
    """Check if TarFile is still open, and if the operation's mode
    corresponds to TarFile's mode. Raise IOError otherwise. Without
    `mode' only the open state is checked.
    """
    if self.closed:
        raise IOError("%s is closed" % self.__class__.__name__)
    if mode is None:
        return
    if self.mode not in mode:
        raise IOError("bad operation for mode %r" % self.mode)
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002486
def _find_link_target(self, tarinfo):
    """Find the target member of a symlink or hardlink member in the
    archive. Raise KeyError if there is no such member.
    """
    if tarinfo.issym():
        # A symlink's target is taken relative to the directory of the
        # link itself. Always search the entire archive.
        linkname = "/".join((os.path.dirname(tarinfo.name),
                             tarinfo.linkname))
        limit = None
    else:
        # Search the archive before the link, because a hard link is
        # just a reference to an already archived file.
        linkname, limit = tarinfo.linkname, tarinfo

    member = self._getmember(linkname, tarinfo=limit, normalize=True)
    if member is None:
        raise KeyError("linkname %r not found" % linkname)
    return member
2505
def __iter__(self):
    """Provide an iterator object: a plain iterator over self.members
    once the archive is fully loaded, a TarIter otherwise.
    """
    if not self._loaded:
        return TarIter(self)
    return iter(self.members)
2513
def _dbg(self, level, msg):
    """Write debugging output to sys.stderr, provided that `level'
    does not exceed self.debug.
    """
    if self.debug >= level:
        print(msg, file=sys.stderr)
Lars Gustäbel01385812010-03-03 12:08:54 +00002519
def __enter__(self):
    """Enter the runtime context: verify through _check() that the
    TarFile is still open and return the TarFile itself.
    """
    self._check()
    return self
2523
def __exit__(self, type, value, traceback):
    """Leave the runtime context. Without a pending exception the
    archive is closed normally; otherwise only the underlying file
    object is closed (unless it is an external one) and the TarFile is
    marked as closed.
    """
    if type is None:
        self.close()
        return

    # An exception occurred. We must not call close() because
    # it would try to write end-of-archive blocks and padding.
    if not self._extfileobj:
        self.fileobj.close()
    self.closed = True
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002533# class TarFile
2534
class TarIter:
    """Iterator Class.

    for tarinfo in TarFile(...):
        suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object.
        """
        self.tarfile = tarfile
        # Position of the next member to hand out, counted from the
        # start of the archive.
        self.index = 0

    def __iter__(self):
        """Return iterator object.
        """
        return self

    def __next__(self):
        """Return the next member of the archive.

        Members that are already present in tarfile.members are served
        from that list; further members are read with TarFile's next()
        method. When all members have been read, set TarFile as _loaded
        and raise StopIteration.
        """
        tarfile = self.tarfile
        if self.index == 0 and tarfile.firstmember is not None:
            # The first member is still pending in the TarFile; fetch
            # it through next() so that the pending marker is cleared
            # and it is not handed out a second time later on.
            tarinfo = tarfile.next()
        elif self.index < len(tarfile.members):
            # Already read, e.g. by an earlier iterator or because
            # getmembers() was called during iteration (SF #1100429).
            # Serving it from the list keeps this iterator from
            # skipping members or stopping prematurely.
            tarinfo = tarfile.members[self.index]
        elif not tarfile._loaded:
            tarinfo = tarfile.next()
            if not tarinfo:
                tarfile._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return tarinfo
2570
Neal Norwitzb9ef4ae2003-01-05 23:19:43 +00002571#--------------------
2572# exported functions
2573#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
    are able to handle, else return False.
    """
    try:
        archive = open(name)
        archive.close()
    except TarError:
        return False
    else:
        return True
2584
# Keep the builtin open() reachable under the name bltn_open, because the
# module-level name open is rebound to TarFile.open on the next line.
bltn_open = open
open = TarFile.open