optimize ZSTDMT_compress() memory usage

No longer allocates temporary buffers
when there is enough room in dstBuffer to compress directly into it.
(The previous method would skip the allocation for the 1st chunk only.)

Also: fix ZSTD_compressBound() for small srcSize
diff --git a/tests/zstreamtest.c b/tests/zstreamtest.c
index 370859c..10bcbe0 100644
--- a/tests/zstreamtest.c
+++ b/tests/zstreamtest.c
@@ -856,6 +856,7 @@
         /* some issues can only happen when reusing states */
         if ((FUZ_rand(&lseed) & 0xFF) == 131) {
             U32 const nbThreads = (FUZ_rand(&lseed) % 6) + 1;
+            DISPLAYLEVEL(5, "Creating new context with %u threads \n", nbThreads);
             ZSTDMT_freeCCtx(zc);
             zc = ZSTDMT_createCCtx(nbThreads);
             resetAllowed=0;
@@ -946,7 +947,7 @@
                     outBuff.size = outBuff.pos + adjustedDstSize;
                     DISPLAYLEVEL(5, "Flushing into dst buffer of size %u \n", (U32)adjustedDstSize);
                     {   size_t const flushError = ZSTDMT_flushStream(zc, &outBuff);
-                        CHECK (ZSTD_isError(flushError), "flush error : %s", ZSTD_getErrorName(flushError));
+                        CHECK (ZSTD_isError(flushError), "ZSTDMT_flushStream error : %s", ZSTD_getErrorName(flushError));
             }   }   }
 
             /* final frame epilogue */
@@ -957,7 +958,7 @@
                     outBuff.size = outBuff.pos + adjustedDstSize;
                     DISPLAYLEVEL(5, "Ending into dst buffer of size %u \n", (U32)adjustedDstSize);
                     remainingToFlush = ZSTDMT_endStream(zc, &outBuff);
-                    CHECK (ZSTD_isError(remainingToFlush), "flush error : %s", ZSTD_getErrorName(remainingToFlush));
+                    CHECK (ZSTD_isError(remainingToFlush), "ZSTDMT_endStream error : %s", ZSTD_getErrorName(remainingToFlush));
                     DISPLAYLEVEL(5, "endStream : remainingToFlush : %u \n", (U32)remainingToFlush);
             }   }
             DISPLAYLEVEL(5, "Frame completed \n");