[ #403753 ] zlib decompress; uncontrollable memory usage

Mostly by Toby Dickenson and Titus Brown.

Add an optional argument to a decompression object's decompress()
method.  The argument specifies the maximum length of the return
value.  If the uncompressed data exceeds this length, the excess data
is stored as the unconsumed_tail attribute.  (Not to be confused with
unused_data, which is a separate issue.)

Difference from SF patch: Default value for unconsumed_tail is ""
rather than None.  It's simpler if the attribute is always a string.
diff --git a/Doc/lib/libzlib.tex b/Doc/lib/libzlib.tex
index e384b1f..b9726d7 100644
--- a/Doc/lib/libzlib.tex
+++ b/Doc/lib/libzlib.tex
@@ -120,7 +120,7 @@
 action is to delete the object.  
 \end{methoddesc}
 
-Decompression objects support the following methods, and a single attribute:
+Decompression objects support the following methods, and two attributes:
 
 \begin{memberdesc}{unused_data}
 A string which contains any unused data from the last string fed to
@@ -135,13 +135,27 @@
 no longer the empty string.  
 \end{memberdesc}
 
-\begin{methoddesc}[Decompress]{decompress}{string}
+\begin{memberdesc}{unconsumed_tail}
+A string that contains any data that was not consumed by the last
+\method{decompress} call because it exceeded the limit for the
+uncompressed data buffer.
+\end{memberdesc}
+
+\begin{methoddesc}[Decompress]{decompress}{string\optional{, max_length}}
 Decompress \var{string}, returning a string containing the
 uncompressed data corresponding to at least part of the data in
 \var{string}.  This data should be concatenated to the output produced
 by any preceding calls to the
 \method{decompress()} method.  Some of the input data may be preserved
 in internal buffers for later processing.
+
+If the optional parameter \var{max_length} is supplied then the return value
+will be no longer than \var{max_length}. This may mean that not all of the
+compressed input can be processed; any unconsumed data will be stored
+in the attribute \member{unconsumed_tail}. This string must be passed
+to a subsequent call to \method{decompress()} if decompression is to
+continue.  If \var{max_length} is not supplied then the whole input is
+decompressed, and \member{unconsumed_tail} is an empty string.
 \end{methoddesc}
 
 \begin{methoddesc}[Decompress]{flush}{}
diff --git a/Lib/test/output/test_zlib b/Lib/test/output/test_zlib
index 61c33cf..1c2e2e9 100644
--- a/Lib/test/output/test_zlib
+++ b/Lib/test/output/test_zlib
@@ -8,4 +8,7 @@
 compress/decompression obj succeeded
 decompress with init options succeeded
 decompressobj with init options succeeded
+should be '': ''
+max_length decompressobj succeeded
+unconsumed_tail should be '': ''
 Testing on 17K of random data
diff --git a/Lib/test/test_zlib.py b/Lib/test/test_zlib.py
index 439db22..915f582 100644
--- a/Lib/test/test_zlib.py
+++ b/Lib/test/test_zlib.py
@@ -76,6 +76,36 @@
 else:
     print "decompressobj with init options succeeded"
 
+print "should be '':", `deco.unconsumed_tail`
+
+# Check max_length: decompress in capped steps, re-feeding unconsumed_tail
+deco = zlib.decompressobj(-12)
+cb = combuf
+bufs = []
+while cb:
+    max_length = 1 + len(cb)/10   # cap shrinks along with the remaining input
+    chunk = deco.decompress(cb, max_length)
+    if len(chunk) > max_length:
+        print 'chunk too big (%d>%d)' % (len(chunk),max_length)
+    bufs.append(chunk)
+    cb = deco.unconsumed_tail     # input the capped call did not consume
+bufs.append(deco.flush())
+decomp2 = ''.join(bufs)           # join the collected chunks, not the reference string
+if decomp2 != buf:                # buf: presumably the original data set up earlier
+    print "max_length decompressobj failed"
+else:
+    print "max_length decompressobj succeeded"
+    
+# Misc tests of max_length
+deco = zlib.decompressobj(-12)
+try:
+    deco.decompress("", -1)
+except ValueError:
+    pass
+else:
+    print "failed to raise value error on bad max_length"
+print "unconsumed_tail should be '':", `deco.unconsumed_tail`
+
 # Test flush() with the various options, using all the different levels
 # in order to provide more variations.
 sync_opt = ['Z_NO_FLUSH', 'Z_SYNC_FLUSH', 'Z_FULL_FLUSH']
diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c
index a2e6aed..2d9e777 100644
--- a/Modules/zlibmodule.c
+++ b/Modules/zlibmodule.c
@@ -78,6 +78,7 @@
   PyObject_HEAD
   z_stream zst;
   PyObject *unused_data;
+  PyObject *unconsumed_tail;
   int is_initialised;
 } compobject;
 
@@ -100,6 +101,15 @@
                 return NULL;
 	self->is_initialised = 0;
 	self->unused_data = PyString_FromString("");
+	if (self->unused_data == NULL) {
+	    Py_DECREF(self);
+	    return NULL;
+	}
+	self->unconsumed_tail = PyString_FromString("");
+	if (self->unconsumed_tail == NULL) {
+	    Py_DECREF(self);
+	    return NULL;
+	}
         return self;
 }
 
@@ -485,6 +495,7 @@
     if (self->is_initialised)
       deflateEnd(&self->zst);
     Py_XDECREF(self->unused_data);
+    Py_XDECREF(self->unconsumed_tail);
     PyObject_Del(self);
 
     LEAVE_ZLIB
@@ -498,6 +509,7 @@
     if (self->is_initialised)
       inflateEnd(&self->zst);
     Py_XDECREF(self->unused_data);
+    Py_XDECREF(self->unconsumed_tail);
     PyObject_Del(self);
 
     LEAVE_ZLIB
@@ -595,27 +607,41 @@
 }
 
 static char decomp_decompress__doc__[] =
-"decompress(data) -- Return a string containing the decompressed version of the data.\n\n"
+"decompress(data, max_length) -- Return a string containing\n"
+"the decompressed version of the data.\n\n"
 "After calling this function, some of the input data may still\n"
 "be stored in internal buffers for later processing.\n"
-"Call the flush() method to clear these buffers."
+"Call the flush() method to clear these buffers.\n"
+"If the max_length parameter is specified then the return value will be\n"
+"no longer than max_length.  Unconsumed input data will be stored in\n"
+"the unconsumed_tail attribute."
 ;
 
 static PyObject *
 PyZlib_objdecompress(compobject *self, PyObject *args)
 {
-  int err, inplen, length = DEFAULTALLOC;
+  int err, inplen, old_length, length = DEFAULTALLOC;
+  int max_length = 0;
   PyObject *RetVal;
   Byte *input;
   unsigned long start_total_out;
   int return_error;
   PyObject * inputString;
 
-  if (!PyArg_ParseTuple(args, "S:decompress", &inputString))
+  if (!PyArg_ParseTuple(args, "S|i:decompress", &inputString, &max_length))
     return NULL;
+  if (max_length < 0) {
+    PyErr_SetString(PyExc_ValueError,
+		    "max_length must be greater than zero");
+    return NULL;
+  }
+
   if (PyString_AsStringAndSize(inputString, (char**)&input, &inplen) == -1)
     return NULL;
 
+  /* limit amount of data allocated to max_length */
+  if (max_length && length > max_length) 
+    length = max_length;
   if (!(RetVal = PyString_FromStringAndSize(NULL, length))) {
     PyErr_SetString(PyExc_MemoryError,
 		    "Can't allocate memory to compress data");
@@ -637,23 +663,46 @@
   err = inflate(&(self->zst), Z_SYNC_FLUSH);
   Py_END_ALLOW_THREADS
 
-  /* while Z_OK and the output buffer is full, there might be more output,
-    so extend the output buffer and try again */
+  /* While Z_OK and the output buffer is full, there might be more output.
+     So extend the output buffer and try again.
+  */
   while (err == Z_OK && self->zst.avail_out == 0) { 
-    if (_PyString_Resize(&RetVal, length << 1) == -1) {
+    /* If max_length set, don't continue decompressing if we've already
+        reached the limit.
+    */
+    if (max_length && length >= max_length)
+      break;
+
+    /* otherwise, ... */
+    old_length = length;
+    length = length << 1;
+    if (max_length && length > max_length) 
+      length = max_length;
+
+    if (_PyString_Resize(&RetVal, length) == -1) {
       PyErr_SetString(PyExc_MemoryError,
                       "Can't allocate memory to compress data");
       return_error = 1;
       break;
     }
-    self->zst.next_out = (unsigned char *)PyString_AsString(RetVal) + length;
-    self->zst.avail_out = length;
-    length = length << 1;
+    self->zst.next_out = (unsigned char *)PyString_AsString(RetVal)+old_length;
+    self->zst.avail_out = length - old_length;
+
     Py_BEGIN_ALLOW_THREADS
     err = inflate(&(self->zst), Z_SYNC_FLUSH);
     Py_END_ALLOW_THREADS
   }
 
+  /* If max_length is set, some of the compressed input may not have been
+    consumed.  Save whatever input remains unconsumed in an attribute. */
+  if(max_length) {
+    Py_DECREF(self->unconsumed_tail);
+    self->unconsumed_tail = PyString_FromStringAndSize(self->zst.next_in, 
+						       self->zst.avail_in);
+    if(!self->unconsumed_tail)
+      return_error = 1;
+  }
+
   /* The end of the compressed data has been reached, so set the unused_data 
     attribute to a string containing the remainder of the data in the string. 
     Note that this is also a logical place to call inflateEnd, but the old
@@ -885,6 +934,11 @@
 	    Py_INCREF(self->unused_data);
             retval = self->unused_data;
 	  }
+	else if (strcmp(name, "unconsumed_tail") == 0) 
+	  {  
+	    Py_INCREF(self->unconsumed_tail);
+	    retval = self->unconsumed_tail;
+	  }
 	else 
 	  retval = Py_FindMethod(Decomp_methods, (PyObject *)self, name);