optimize ZSTDMT_compress() memory usage

No longer allocates temporary buffers
when there is enough room in dstBuffer to compress directly into it.
(The previous method skipped the allocation for the 1st chunk only.)

Also: fix ZSTD_compressBound() for small srcSize
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index cc69f11..e1c7349 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -33,7 +33,8 @@
 ***************************************/
 #define ZSTD_STATIC_ASSERT(c) { enum { ZSTD_static_assert = 1/(int)(!!(c)) }; }
 size_t ZSTD_compressBound(size_t srcSize) {
-    size_t const margin = (srcSize < 512 KB) ? 16 : 0;
+    size_t const lowLimit = 256 KB;
+    size_t const margin = (srcSize < lowLimit) ? (lowLimit-srcSize) >> 12 : 0;  /* from 64 to 0 */
     return srcSize + (srcSize >> 8) + margin;
 }