compression optimization opportunity

switch to single-pass mode directly into output buffer
when outputSize >= ZSTD_compressBound(inputSize).
Speed gains observed with fullbench (~+15% on level 1)
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index b2c7cfc..59761b1 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -72,6 +72,7 @@
 #if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2)
 #  include <stdio.h>
 /* recommended values for ZSTD_DEBUG display levels :
+ * 1 : no display, enables assert() only
  * 2 : reserved for currently active debugging path
  * 3 : events once per object lifetime (CCtx, CDict)
  * 4 : events once per frame
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index f492d92..d4c029e 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -2954,6 +2954,8 @@
     const BYTE* const ip = (const BYTE*) src;
     size_t fhSize = 0;
 
+    DEBUGLOG(5, "ZSTD_compressContinue_internal");
+    DEBUGLOG(5, "stage: %u", cctx->stage);
     if (cctx->stage==ZSTDcs_created) return ERROR(stage_wrong);   /* missing init (ZSTD_compressBegin) */
 
     if (frame && (cctx->stage==ZSTDcs_init)) {
@@ -3211,9 +3213,9 @@
                                    ZSTD_parameters params, U64 pledgedSrcSize,
                                    ZSTD_buffered_policy_e zbuff)
 {
-    DEBUGLOG(5, "ZSTD_compressBegin_internal");
-    DEBUGLOG(5, "dict ? %s", dict ? "dict" : cdict ? "cdict" : "none");
-    DEBUGLOG(5, "dictMode : %u", (U32)dictMode);
+    DEBUGLOG(4, "ZSTD_compressBegin_internal");
+    DEBUGLOG(4, "dict ? %s", dict ? "dict" : cdict ? "cdict" : "none");
+    DEBUGLOG(4, "dictMode : %u", (U32)dictMode);
     /* params are supposed to be fully validated at this point */
     assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
     assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
@@ -3633,7 +3635,7 @@
                     const ZSTD_CDict* cdict,
                     ZSTD_parameters params, unsigned long long pledgedSrcSize)
 {
-    DEBUGLOG(5, "ZSTD_resetCStream_internal");
+    DEBUGLOG(4, "ZSTD_resetCStream_internal");
     /* params are supposed to be fully validated at this point */
     assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
     assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
@@ -3766,6 +3768,9 @@
     return length;
 }
 
+/** ZSTD_compressStream_generic():
+ *  internal function for all *compressStream*() variants and *compress_generic()
+ * @return : hint size for next input */
 static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                                         ZSTD_outBuffer* output,
                                         ZSTD_inBuffer* input,
@@ -3780,7 +3785,7 @@
     U32 someMoreWork = 1;
 
     /* check expectations */
-    DEBUGLOG(5, "ZSTD_compressStream_generic");
+    DEBUGLOG(5, "ZSTD_compressStream_generic, order %u", (U32)flushMode);
     assert(zcs->inBuff != NULL);
     assert(zcs->inBuffSize>0);
     assert(zcs->outBuff!= NULL);
@@ -3796,6 +3801,20 @@
             return ERROR(init_missing);
 
         case zcss_load:
+            if ( (flushMode == ZSTD_e_end)
+              && ((size_t)(oend-op) >= ZSTD_compressBound(iend-ip))
+              && (zcs->inBuffPos == 0) ) {
+                /* shortcut to compression pass directly into output buffer */
+                size_t const cSize = ZSTD_compressEnd(zcs,
+                                                op, oend-op, ip, iend-ip);
+                DEBUGLOG(4, "ZSTD_compressEnd : %u", (U32)cSize);
+                if (ZSTD_isError(cSize)) return cSize;
+                ip = iend;
+                op += cSize;
+                zcs->frameEnded = 1;
+                ZSTD_startNewCompression(zcs);
+                someMoreWork = 0; break;
+              }
             /* complete inBuffer */
             {   size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos;
                 size_t const loaded = ZSTD_limitCopy(
@@ -3922,8 +3941,8 @@
     if (input->pos  > input->size)  return ERROR(GENERIC);
     assert(cctx!=NULL);
 
+    /* transparent initialization stage */
     if (cctx->streamStage == zcss_init) {
-        /* transparent reset */
         const void* const prefix = cctx->prefix;
         size_t const prefixSize = cctx->prefixSize;
         ZSTD_parameters params = cctx->requestedParams;
@@ -3944,6 +3963,7 @@
             CHECK_F( ZSTD_resetCStream_internal(cctx, prefix, prefixSize, cctx->dictMode, cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) );
     }   }
 
+    /* compression stage */
 #ifdef ZSTD_MULTITHREAD
     if (cctx->nbThreads > 1) {
         size_t const flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp);
@@ -3956,7 +3976,6 @@
     }
 #endif
 
-    DEBUGLOG(5, "calling ZSTD_compressStream_generic(%i,...)", endOp);
     CHECK_F( ZSTD_compressStream_generic(cctx, output, input, endOp) );
     DEBUGLOG(5, "completed ZSTD_compress_generic");
     return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */