Related to SF patch 618135: gzip.py and files > 2G.
Fixed the signed/unsigned confusions when dealing with files >= 2GB.
4GB is still a hard limitation of the gzip file format, though.
Testing this was a bitch on Win98SE due to frequent system freezes. The
system didn't freeze while running gzip; it kept freezing while trying to
*create* a > 2GB test file! This wasn't Python's doing. I don't know of a
reasonable way to test this functionality in regrtest.py, so I'm not
checking in a test case (a test case would necessarily require creating
a 2GB+ file first, using gzip to zip it, using gzip to unzip it again,
and then comparing before-and-after; so >4GB free space would be required,
and a loooong time; I did all this "by hand" once).
Bugfix candidate, I guess.
diff --git a/Lib/gzip.py b/Lib/gzip.py
index 55d448d..8802adb 100644
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -15,12 +15,21 @@
READ, WRITE = 1, 2
+def U32(i):
+ """Return i as an unsigned integer, assuming it fits in 32 bits.
+
+ If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
+ """
+ if i < 0:
+ i += 1L << 32
+ return i
+
def write32(output, value):
output.write(struct.pack("<l", value))
def write32u(output, value):
- if value < 0:
- value = value + 0x100000000L
+ # The L format writes the bit pattern correctly whether signed
+ # or unsigned.
output.write(struct.pack("<L", value))
def read32(input):
@@ -157,19 +166,21 @@
if flag & FEXTRA:
# Read & discard the extra field, if present
- xlen=ord(self.fileobj.read(1))
- xlen=xlen+256*ord(self.fileobj.read(1))
+ xlen = ord(self.fileobj.read(1))
+ xlen = xlen + 256*ord(self.fileobj.read(1))
self.fileobj.read(xlen)
if flag & FNAME:
# Read and discard a null-terminated string containing the filename
while True:
- s=self.fileobj.read(1)
- if not s or s=='\000': break
+ s = self.fileobj.read(1)
+ if not s or s=='\000':
+ break
if flag & FCOMMENT:
# Read and discard a null-terminated string containing a comment
while True:
- s=self.fileobj.read(1)
- if not s or s=='\000': break
+ s = self.fileobj.read(1)
+ if not s or s=='\000':
+ break
if flag & FHCRC:
self.fileobj.read(2) # Read & discard the 16-bit header CRC
@@ -225,7 +236,8 @@
self.offset -= len(buf)
def _read(self, size=1024):
- if self.fileobj is None: raise EOFError, "Reached EOF"
+ if self.fileobj is None:
+ raise EOFError, "Reached EOF"
if self._new_member:
# If the _new_member flag is set, we have to
@@ -286,8 +298,8 @@
# uncompressed data matches the stored values.
self.fileobj.seek(-8, 1)
crc32 = read32(self.fileobj)
- isize = read32(self.fileobj)
- if crc32%0x100000000L != self.crc%0x100000000L:
+ isize = U32(read32(self.fileobj)) # may exceed 2GB
+ if U32(crc32) != U32(self.crc):
raise ValueError, "CRC check failed"
elif isize != self.size:
raise ValueError, "Incorrect length of data produced"
@@ -296,7 +308,8 @@
if self.mode == WRITE:
self.fileobj.write(self.compress.flush())
write32(self.fileobj, self.crc)
- write32(self.fileobj, self.size)
+ # self.size may exceed 2GB
+ write32u(self.fileobj, self.size)
self.fileobj = None
elif self.mode == READ:
self.fileobj = None
@@ -338,15 +351,16 @@
if offset < self.offset:
raise IOError('Negative seek in write mode')
count = offset - self.offset
- for i in range(count/1024):
- self.write(1024*'\0')
- self.write((count%1024)*'\0')
+ for i in range(count // 1024):
+ self.write(1024 * '\0')
+ self.write((count % 1024) * '\0')
elif self.mode == READ:
if offset < self.offset:
# for negative seek, rewind and do positive seek
self.rewind()
count = offset - self.offset
- for i in range(count/1024): self.read(1024)
+ for i in range(count // 1024):
+ self.read(1024)
self.read(count % 1024)
def readline(self, size=-1):
@@ -379,11 +393,13 @@
def readlines(self, sizehint=0):
# Negative numbers result in reading all the lines
- if sizehint <= 0: sizehint = sys.maxint
+ if sizehint <= 0:
+ sizehint = sys.maxint
L = []
while sizehint > 0:
line = self.readline()
- if line == "": break
+ if line == "":
+ break
L.append(line)
sizehint = sizehint - len(line)
diff --git a/Misc/NEWS b/Misc/NEWS
index c8334d7..980e4d2 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -355,6 +355,10 @@
Library
-------
+- gzip.py now handles files exceeding 2GB. Note that 4GB is still a
+ fundamental limitation of the underlying gzip file format (it only
+ has 32 bits to record the file size).
+
- xml.sax.saxutils.unescape has been added, to replace entity references
with their entity value.
@@ -365,7 +369,7 @@
- Various configure methods of Tkinter have been stream-lined, so that
tag_configure, image_configure, window_configure now return a
- dictionary when invoked with no argument.
+ dictionary when invoked with no argument.
- Importing the readline module now no longer has the side effect of
calling setlocale(LC_CTYPE, ""). The initial "C" locale, or