modified streaming decompression API
diff --git a/lib/zstd_buffered.c b/lib/zstd_buffered.c
index 48721d6..aab83e6 100644
--- a/lib/zstd_buffered.c
+++ b/lib/zstd_buffered.c
@@ -335,8 +335,6 @@
     size_t outStart;
     size_t outEnd;
     size_t hPos;
-    const char* dict;
-    size_t dictSize;
     ZBUFF_dStage stage;
     unsigned char headerBuffer[ZSTD_frameHeaderSize_max];
 };   /* typedef'd to ZBUFF_DCtx within "zstd_buffered.h" */
@@ -365,19 +363,16 @@
 
 /* *** Initialization *** */
 
-size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbc)
+size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* zbc, const void* dict, size_t dictSize)
 {
     zbc->stage = ZBUFFds_readHeader;
-    zbc->hPos = zbc->inPos = zbc->outStart = zbc->outEnd = zbc->dictSize = 0;
-    return ZSTD_resetDCtx(zbc->zc);
+    zbc->hPos = zbc->inPos = zbc->outStart = zbc->outEnd = 0;   /* all buffer positions restart at 0 */
+    return ZSTD_decompressBegin_usingDict(zbc->zc, dict, dictSize);   /* restarts the inner ZSTD_DCtx ; dict is loaded only when dict!=NULL and dictSize!=0 */
 }
 
-
-size_t ZBUFF_decompressWithDictionary(ZBUFF_DCtx* zbc, const void* src, size_t srcSize)
+size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbc)
 {
-    zbc->dict = (const char*)src;
-    zbc->dictSize = srcSize;
-    return 0;
+    return ZBUFF_decompressInitDictionary(zbc, NULL, 0);   /* no dictionary : same initialization, dictionary loading is skipped */
 }
 
 
@@ -458,8 +453,6 @@
                         if (zbc->outBuff == NULL) return ERROR(memory_allocation);
                     }
                 }
-                if (zbc->dictSize)
-                    ZSTD_decompress_insertDictionary(zbc->zc, zbc->dict, zbc->dictSize);
                 if (zbc->hPos)
                 {
                     /* some data already loaded into headerBuffer : transfer into inBuff */
diff --git a/lib/zstd_buffered.h b/lib/zstd_buffered.h
index d2316a8..63101a1 100644
--- a/lib/zstd_buffered.h
+++ b/lib/zstd_buffered.h
@@ -48,7 +48,7 @@
 
 
 /* ***************************************************************
-*  Tuning parameters
+*  Compiler specifics
 *****************************************************************/
 /*!
 *  ZSTD_DLL_EXPORT :
@@ -69,7 +69,7 @@
 ZSTDLIB_API size_t      ZBUFF_freeCCtx(ZBUFF_CCtx* cctx);
 
 ZSTDLIB_API size_t ZBUFF_compressInit(ZBUFF_CCtx* cctx, int compressionLevel);
-ZSTDLIB_API size_t ZBUFF_compressWithDictionary(ZBUFF_CCtx* cctx, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZBUFF_compressWithDictionary(ZBUFF_CCtx* cctx, const void* dict, size_t dictSize);
 ZSTDLIB_API size_t ZBUFF_compressContinue(ZBUFF_CCtx* cctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr);
 ZSTDLIB_API size_t ZBUFF_compressFlush(ZBUFF_CCtx* cctx, void* dst, size_t* maxDstSizePtr);
 ZSTDLIB_API size_t ZBUFF_compressEnd(ZBUFF_CCtx* cctx, void* dst, size_t* maxDstSizePtr);
@@ -119,7 +119,7 @@
 ZSTDLIB_API size_t      ZBUFF_freeDCtx(ZBUFF_DCtx* dctx);
 
 ZSTDLIB_API size_t ZBUFF_decompressInit(ZBUFF_DCtx* dctx);
-ZSTDLIB_API size_t ZBUFF_decompressWithDictionary(ZBUFF_DCtx* dctx, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* dctx, const void* dict, size_t dictSize);
 
 ZSTDLIB_API size_t ZBUFF_decompressContinue(ZBUFF_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr);
 
@@ -128,19 +128,16 @@
 *
 *  A ZBUFF_DCtx object is required to track streaming operation.
 *  Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
-*  Use ZBUFF_decompressInit() to start a new decompression operation.
-*  ZBUFF_DCtx objects can be reused multiple times.
-*
-*  Optionally, a reference to a static dictionary can be set, using ZBUFF_decompressWithDictionary()
-*  It must be the same content as the one set during compression phase.
-*  Dictionary content must remain accessible during the decompression process.
+*  Use ZBUFF_decompressInit() to start a new decompression operation,
+*   or ZBUFF_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFF_DCtx objects can be reused multiple times.
 *
 *  Use ZBUFF_decompressContinue() repetitively to consume your input.
 *  *srcSizePtr and *maxDstSizePtr can be any size.
 *  The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr.
 *  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
 *  The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst.
-*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency)
 *            or 0 when a frame is completely decoded
 *            or an error code, which can be tested using ZBUFF_isError().
 *
@@ -157,7 +154,7 @@
 ZSTDLIB_API const char* ZBUFF_getErrorName(size_t errorCode);
 
 /** The below functions provide recommended buffer sizes for Compression or Decompression operations.
-*   These sizes are not compulsory, they just tend to offer better latency */
+*   These sizes are just hints, and tend to offer better latency */
 ZSTDLIB_API size_t ZBUFF_recommendedCInSize(void);
 ZSTDLIB_API size_t ZBUFF_recommendedCOutSize(void);
 ZSTDLIB_API size_t ZBUFF_recommendedDInSize(void);
diff --git a/lib/zstd_compress.c b/lib/zstd_compress.c
index db01c33..4124e4f 100644
--- a/lib/zstd_compress.c
+++ b/lib/zstd_compress.c
@@ -225,6 +225,42 @@
 }
 
 
+/*! ZSTD_copyCCtx
+*   Duplicate an existing context @srcCCtx into another one @dstCCtx.
+*   Only works during stage 0 (i.e. before first call to ZSTD_compressContinue())
+*   @return : 0, or ERROR(stage_wrong) when @srcCCtx is no longer at stage 0 */
+size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx)
+{
+    const U32 contentLog = (srcCCtx->params.strategy == ZSTD_fast) ? 1 : srcCCtx->params.contentLog;   /* ZSTD_fast : log forced to 1 */
+    const size_t tableSpace = ((1 << contentLog) + (1 << srcCCtx->params.hashLog)) * sizeof(U32);   /* nb of table bytes copied below */
+
+    if (srcCCtx->stage!=0) return ERROR(stage_wrong);
+
+    ZSTD_resetCCtx_advanced(dstCCtx, srcCCtx->params);   /* NOTE(review): result is ignored - confirm whether this call can fail */
+
+    /* copy tables */
+    memcpy(dstCCtx->hashTable, srcCCtx->hashTable, tableSpace);   /* one copy from hashTable ; presumably the content table follows it contiguously - TODO confirm */
+
+    /* copy frame header */
+    dstCCtx->hbSize = srcCCtx->hbSize;
+    memcpy(dstCCtx->headerBuffer , srcCCtx->headerBuffer, srcCCtx->hbSize);
+
+    /* copy dictionary pointers */
+    dstCCtx->nextToUpdate= srcCCtx->nextToUpdate;
+    dstCCtx->nextSrc     = srcCCtx->nextSrc;
+    dstCCtx->base        = srcCCtx->base;
+    dstCCtx->dictBase    = srcCCtx->dictBase;
+    dstCCtx->dictLimit   = srcCCtx->dictLimit;
+    dstCCtx->lowLimit    = srcCCtx->lowLimit;
+
+    dstCCtx->flagHufTable = srcCCtx->flagHufTable;
+    if (dstCCtx->flagHufTable)
+        memcpy(dstCCtx->hufTable, srcCCtx->hufTable, 256*4);   /* NOTE(review): 256*4 is presumably the byte size of hufTable - confirm */
+
+    return 0;
+}
+
+
 /** ZSTD_reduceIndex
 *   rescale indexes to avoid future overflow (indexes are U32) */
 static void ZSTD_reduceIndex (ZSTD_CCtx* zc,
@@ -2133,49 +2169,17 @@
 
 size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* zc, const void* dict, size_t dictSize)
 {
-    U32 magic = MEM_readLE32(dict);
-    U32 eSize;
-    if (magic != ZSTD_DICT_MAGIC)
-        return ZSTD_loadDictionaryContent(zc, dict, dictSize);
+    if (dict && dictSize)   /* NULL or empty dictionary : nothing to load, return 0 */
+    {
+        U32 magic = MEM_readLE32(dict);
+        size_t eSize;
+        if (magic != ZSTD_DICT_MAGIC)   /* no magic number : the whole buffer is raw dictionary content */
+            return ZSTD_loadDictionaryContent(zc, dict, dictSize);
 
-    eSize = ZSTD_loadDictEntropyStats(zc, (const char*)dict+4, dictSize-4) + 4;
-    if (ZSTD_isError(eSize)) return eSize;
-    return ZSTD_loadDictionaryContent(zc, (const char*)dict+eSize, dictSize-eSize);
-}
-
-
-/*! ZSTD_duplicateCCtx
-*   Duplicate an existing context @srcCCtx into another one @dstCCtx.
-*   Only works during stage 0 (i.e. before first call to ZSTD_compressContinue())
-*   @return : 0, or an error code */
-size_t ZSTD_duplicateCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx)
-{
-    const U32 contentLog = (srcCCtx->params.strategy == ZSTD_fast) ? 1 : srcCCtx->params.contentLog;
-    const size_t tableSpace = ((1 << contentLog) + (1 << srcCCtx->params.hashLog)) * sizeof(U32);
-
-    if (srcCCtx->stage!=0) return ERROR(stage_wrong);
-
-    ZSTD_resetCCtx_advanced(dstCCtx, srcCCtx->params);
-
-    /* copy tables */
-    memcpy(dstCCtx->hashTable, srcCCtx->hashTable, tableSpace);
-
-    /* copy frame header */
-    dstCCtx->hbSize = srcCCtx->hbSize;
-    memcpy(dstCCtx->headerBuffer , srcCCtx->headerBuffer, srcCCtx->hbSize);
-
-    /* copy dictionary pointers */
-    dstCCtx->nextToUpdate= srcCCtx->nextToUpdate;
-    dstCCtx->nextSrc     = srcCCtx->nextSrc;
-    dstCCtx->base        = srcCCtx->base;
-    dstCCtx->dictBase    = srcCCtx->dictBase;
-    dstCCtx->dictLimit   = srcCCtx->dictLimit;
-    dstCCtx->lowLimit    = srcCCtx->lowLimit;
-
-    dstCCtx->flagHufTable = srcCCtx->flagHufTable;
-    if (dstCCtx->flagHufTable)
-        memcpy(dstCCtx->hufTable, srcCCtx->hufTable, 256*4);
-
+        eSize = ZSTD_loadDictEntropyStats(zc, (const char*)dict+4, dictSize-4);   /* entropy stats start right after the 4-byte magic */
+        if (ZSTD_isError(eSize)) return eSize;   /* test the raw result : adding the magic size first would alter an error code */
+        return ZSTD_loadDictionaryContent(zc, (const char*)dict+4+eSize, dictSize-4-eSize);   /* remaining bytes (after magic + stats) are content */
+    }
     return 0;
 }
 
diff --git a/lib/zstd_decompress.c b/lib/zstd_decompress.c
index ce54d20..93e8288 100644
--- a/lib/zstd_decompress.c
+++ b/lib/zstd_decompress.c
@@ -136,6 +136,7 @@
     ZSTD_parameters params;
     blockType_t bType;
     ZSTD_dStage stage;
+    U32 flagHufTable;
     const BYTE* litPtr;
     size_t litBufSize;
     size_t litSize;
@@ -143,7 +144,7 @@
     BYTE headerBuffer[ZSTD_frameHeaderSize_max];
 };  /* typedef'd to ZSTD_DCtx within "zstd_static.h" */
 
-size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx)
+size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
 {
     dctx->expected = ZSTD_frameHeaderSize_min;
     dctx->stage = ZSTDds_getFrameHeaderSize;
@@ -152,6 +153,7 @@
     dctx->vBase = NULL;
     dctx->dictEnd = NULL;
     dctx->hufTableX4[0] = HufLog;
+    dctx->flagHufTable = 0;
     return 0;
 }
 
@@ -159,7 +161,7 @@
 {
     ZSTD_DCtx* dctx = (ZSTD_DCtx*)malloc(sizeof(ZSTD_DCtx));
     if (dctx==NULL) return NULL;
-    ZSTD_resetDCtx(dctx);
+    ZSTD_decompressBegin(dctx);
     return dctx;
 }
 
@@ -169,6 +171,12 @@
     return 0;
 }
 
+void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)   /* copy the leading part of *srcDCtx into *dstDCtx */
+{
+    memcpy(dstDCtx, srcDCtx,
+           sizeof(ZSTD_DCtx) - (BLOCKSIZE+WILDCOPY_OVERLENGTH + ZSTD_frameHeaderSize_max));  /* no need to copy workspace : the last BLOCKSIZE+WILDCOPY_OVERLENGTH+ZSTD_frameHeaderSize_max bytes are skipped ; assumes these are exactly the trailing buffer members of ZSTD_DCtx, without tail padding - TODO confirm */
+}
+
 
 /* *************************************************************
 *   Decompression section
@@ -342,6 +350,8 @@
             U32 lhSize = ((istart[0]) >> 4) & 3;
             if (lhSize != 1)  /* only case supported for now : small litSize, single stream */
                 return ERROR(corruption_detected);
+            if (!dctx->flagHufTable)
+                return ERROR(dictionary_corrupted);
 
             /* 2 - 2 - 10 - 10 */
             lhSize=3;
@@ -631,14 +641,12 @@
     *litPtr = litEnd;   /* update for next sequence */
 
     /* copy Match */
-    if (sequence.offset > (size_t)(oLitEnd - base))
-    {
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
         /* offset beyond prefix */
         if (sequence.offset > (size_t)(oLitEnd - vBase))
             return ERROR(corruption_detected);
         match = dictEnd - (base-match);
-        if (match + sequence.matchLength <= dictEnd)
-        {
+        if (match + sequence.matchLength <= dictEnd) {
             memmove(oLitEnd, match, sequence.matchLength);
             return sequenceLength;
         }
@@ -653,8 +661,7 @@
     }
 
     /* match within prefix */
-    if (sequence.offset < 8)
-    {
+    if (sequence.offset < 8) {
         /* close range match, overlap */
         const int sub2 = dec64table[sequence.offset];
         op[0] = match[0];
@@ -665,24 +672,20 @@
         ZSTD_copy4(op+4, match);
         match -= sub2;
     }
-    else
-    {
+    else {
         ZSTD_copy8(op, match);
     }
     op += 8; match += 8;
 
-    if (oMatchEnd > oend-12)
-    {
-        if (op < oend_8)
-        {
+    if (oMatchEnd > oend-12) {
+        if (op < oend_8) {
             ZSTD_wildcopy(op, match, oend_8 - op);
             match += oend_8 - op;
             op = oend_8;
         }
         while (op < oMatchEnd) *op++ = *match++;
     }
-    else
-    {
+    else {
         ZSTD_wildcopy(op, match, sequence.matchLength-8);   /* works even if matchLength < 8 */
     }
     return sequenceLength;
@@ -735,8 +738,7 @@
         FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
         FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
 
-        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; )
-        {
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) {
             size_t oneSeqSize;
             nbSeq--;
             ZSTD_decodeSequence(&sequence, &seqState);
@@ -764,8 +766,7 @@
 
 static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
 {
-    if (dst != dctx->previousDstEnd)   /* not contiguous */
-    {
+    if (dst != dctx->previousDstEnd) {   /* not contiguous */
         dctx->dictEnd = dctx->previousDstEnd;
         dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
         dctx->base = dst;
@@ -800,10 +801,11 @@
 }
 
 
-size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+/*! ZSTD_decompress_continueDCtx
+*   dctx must have been properly initialized */
+static size_t ZSTD_decompress_continueDCtx(ZSTD_DCtx* dctx,
                                  void* dst, size_t maxDstSize,
-                                 const void* src, size_t srcSize,
-                                 const void* dict, size_t dictSize)
+                                 const void* src, size_t srcSize)
 {
     const BYTE* ip = (const BYTE*)src;
     const BYTE* iend = ip + srcSize;
@@ -813,21 +815,6 @@
     size_t remainingSize = srcSize;
     blockProperties_t blockProperties;
 
-    /* init */
-    ZSTD_resetDCtx(dctx);
-    if (dict)
-    {
-        size_t errorCode = ZSTD_decompress_insertDictionary(dctx, dict, dictSize);
-        if (ZSTD_isError(errorCode)) return ERROR(dictionary_corrupted);
-        dctx->dictEnd = dctx->previousDstEnd;
-        dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
-        dctx->base = dst;
-    }
-    else
-    {
-        dctx->vBase = dctx->base = dctx->dictEnd = dst;
-    }
-
     /* Frame Header */
     {
         size_t frameHeaderSize;
@@ -888,6 +875,27 @@
 }
 
 
+size_t ZSTD_decompress_usingPreparedDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* refDCtx,   /* refDCtx : prepared reference context, only read */
+                                         void* dst, size_t maxDstSize,
+                                   const void* src, size_t srcSize)
+{
+    ZSTD_copyDCtx(dctx, refDCtx);   /* dctx starts from the prepared state instead of reloading the dictionary */
+    ZSTD_checkContinuity(dctx, dst);   /* when dst != previousDstEnd, the previous segment becomes dictEnd and base restarts at dst */
+    return ZSTD_decompress_continueDCtx(dctx, dst, maxDstSize, src, srcSize);
+}
+
+
+size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                 void* dst, size_t maxDstSize,
+                                 const void* src, size_t srcSize,
+                                 const void* dict, size_t dictSize)
+{   const size_t initError = ZSTD_decompressBegin_usingDict(dctx, dict, dictSize);   /* reset dctx, then load dict when dict!=NULL and dictSize!=0 */
+    if (ZSTD_isError(initError)) return initError;   /* e.g. dictionary_corrupted : do not decode after a failed dictionary load */
+    ZSTD_checkContinuity(dctx, dst);   /* when dst != previousDstEnd, the previous segment becomes dictEnd and base restarts at dst */
+    return ZSTD_decompress_continueDCtx(dctx, dst, maxDstSize, src, srcSize);
+}
+
+
 size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
     return ZSTD_decompress_usingDict(dctx, dst, maxDstSize, src, srcSize, NULL, 0);
@@ -933,8 +941,7 @@
             dctx->headerSize = ZSTD_decodeFrameHeader_Part1(dctx, src, ZSTD_frameHeaderSize_min);
             if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize;
             memcpy(dctx->headerBuffer, src, ZSTD_frameHeaderSize_min);
-            if (dctx->headerSize > ZSTD_frameHeaderSize_min)
-            {
+            if (dctx->headerSize > ZSTD_frameHeaderSize_min) {
                 dctx->expected = dctx->headerSize - ZSTD_frameHeaderSize_min;
                 dctx->stage = ZSTDds_decodeFrameHeader;
                 return 0;
@@ -958,13 +965,11 @@
             blockProperties_t bp;
             size_t blockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
             if (ZSTD_isError(blockSize)) return blockSize;
-            if (bp.blockType == bt_end)
-            {
+            if (bp.blockType == bt_end) {
                 dctx->expected = 0;
                 dctx->stage = ZSTDds_getFrameHeaderSize;
             }
-            else
-            {
+            else {
                 dctx->expected = blockSize;
                 dctx->bType = bp.blockType;
                 dctx->stage = ZSTDds_decompressBlock;
@@ -990,7 +995,7 @@
                 rSize = 0;
                 break;
             default:
-                return ERROR(GENERIC);
+                return ERROR(GENERIC);   /* impossible */
             }
             dctx->stage = ZSTDds_decodeBlockHeader;
             dctx->expected = ZSTD_blockHeaderSize;
@@ -1011,15 +1016,15 @@
     dctx->previousDstEnd = (const char*)dict + dictSize;
 }
 
-
 static size_t ZSTD_loadEntropy(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
 {
     size_t hSize = HUF_readDTableX4(dctx->hufTableX4, dict, dictSize);
     if (HUF_isError(hSize)) return ERROR(dictionary_corrupted);
+    dctx->flagHufTable = 1;   /* hufTableX4 was successfully read from the dictionary ; the flag is tested before decoding literals that rely on it */
     return hSize;
 }
 
-size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
 {
     size_t eSize;
     U32 magic = MEM_readLE32(dict);
@@ -1042,3 +1047,18 @@
     return 0;
 }
 
+/*! ZSTD_decompressBegin_usingDict() : reset dctx, then load dict if provided. @return : 0, or an error code (test with ZSTD_isError()) */
+size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    size_t errorCode;
+    errorCode = ZSTD_decompressBegin(dctx);
+    if (ZSTD_isError(errorCode)) return errorCode;
+
+    if (dict && dictSize) {   /* NULL or empty dictionary : equivalent to ZSTD_decompressBegin() alone */
+        errorCode = ZSTD_decompress_insertDictionary(dctx, dict, dictSize);
+        if (ZSTD_isError(errorCode)) return ERROR(dictionary_corrupted);   /* any load failure is reported as dictionary_corrupted */
+    }
+
+    return 0;
+}
+
diff --git a/lib/zstd_static.h b/lib/zstd_static.h
index f99893d..fd4131a 100644
--- a/lib/zstd_static.h
+++ b/lib/zstd_static.h
@@ -92,7 +92,7 @@
 ZSTDLIB_API void ZSTD_validateParams(ZSTD_parameters* params);
 
 /** ZSTD_compress_usingDict
-*   Same as ZSTD_compressCCtx(), using a Dictionary content as prefix
+*   Same as ZSTD_compressCCtx(), loading a Dictionary content.
 *   Note : dict can be NULL, in which case, it's equivalent to ZSTD_compressCCtx() */
 ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
                                            void* dst, size_t maxDstSize,
@@ -108,14 +108,24 @@
                                      const void* dict,size_t dictSize,
                                            ZSTD_parameters params);
 
-/** ZSTD_decompress_usingDict
+/*! ZSTD_decompress_usingDict
 *   Same as ZSTD_decompressDCtx, using a Dictionary content as prefix
 *   Note : dict can be NULL, in which case, it's equivalent to ZSTD_decompressDCtx() */
-ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
                                              void* dst, size_t maxDstSize,
                                        const void* src, size_t srcSize,
                                        const void* dict,size_t dictSize);
 
+/*! ZSTD_decompress_usingPreparedDCtx
+*   Same as ZSTD_decompress_usingDict, but using a reference context preparedDCtx, where dictionary has already been loaded into.
+*   It avoids reloading the dictionary each time.
+*   preparedDCtx must have been properly initialized using ZSTD_decompressBegin_usingDict().
+*   Requires 2 contexts : 1 for reference, which will not be modified, and 1 to run the decompression operation */
+ZSTDLIB_API size_t ZSTD_decompress_usingPreparedDCtx(
+                                             ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx,
+                                             void* dst, size_t maxDstSize,
+                                       const void* src, size_t srcSize);
+
 
 /* **************************************
 *  Streaming functions (direct mode)
@@ -127,7 +137,7 @@
 ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* ctx, ZSTD_parameters params);
 
 ZSTDLIB_API size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* ctx, const void* dict, size_t dictSize);
-ZSTDLIB_API size_t ZSTD_duplicateCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx);
+ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx);
 
 ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
 ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t maxDstSize);
@@ -164,9 +174,11 @@
 */
 
 
-ZSTDLIB_API size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API void   ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx);
+
 ZSTDLIB_API size_t ZSTD_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcSize);
-ZSTDLIB_API size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* ctx, const void* src, size_t srcSize);
 
 ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
 ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
@@ -176,20 +188,19 @@
 
   A ZSTD_DCtx object is required to track streaming operations.
   Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
-  A ZSTD_DCtx object can be re-used multiple times. Use ZSTD_resetDCtx() to return to fresh status.
+  A ZSTD_DCtx object can be re-used multiple times.
 
-  First operation is to retrieve frame parameters, using ZSTD_getFrameParams().
-  This function doesn't consume its input. It needs enough input data to properly decode the frame header.
+  First typical operation is to retrieve frame parameters, using ZSTD_getFrameParams().
+  This operation is independent, and just needs enough input data to properly decode the frame header.
   Objective is to retrieve *params.windowlog, to know minimum amount of memory required during decoding.
   Result : 0 when successful, it means the ZSTD_parameters structure has been filled.
            >0 : means there is not enough data into src. Provides the expected size to successfully decode header.
-           errorCode, which can be tested using ZSTD_isError() (For example, if it's not a ZSTD header)
+           errorCode, which can be tested using ZSTD_isError()
 
-  Then, you can optionally insert a dictionary.
-  This operation must mimic the compressor behavior, otherwise decompression will fail or be corrupted.
+  Start decompression, with ZSTD_decompressBegin() or ZSTD_decompressBegin_usingDict()
+  Alternatively, you can copy a prepared context, using ZSTD_copyDCtx()
 
-  Then it's possible to start decompression.
-  Use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternatively.
+  Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately.
   ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue().
   ZSTD_decompressContinue() requires this exact amount of bytes, or it will fail.
   ZSTD_decompressContinue() needs previous data blocks during decompression, up to (1 << windowlog).
@@ -206,23 +217,23 @@
 /* **************************************
 *  Block functions
 ****************************************/
-
 /*!Block functions produce and decode raw zstd blocks, without frame metadata.
-   It saves associated header sizes.
-   But user will have to save and regenerate fields required to regenerate data, such as block sizes.
+   Frame headers won't be generated.
+   User will have to save and regenerate fields required to regenerate data, such as block sizes.
 
    A few rules to respect :
    - Uncompressed block size must be <= 128 KB
-   - Compressing or decompressing require a context structure
+   - Compressing or decompressing requires a context structure
      + Use ZSTD_createXCtx() to create them
    - It is necessary to init context before starting
-     + compression : ZSTD_compressBegin(), which allows selection of compression level or parameters
-     + decompression : ZSTD_resetDCtx()
-     + If you compress multiple blocks without resetting, next blocks will create references to previous ones
-   - Dictionary can optionally be inserted, using ZSTD_de/compress_insertDictionary()
+     + compression : ZSTD_compressBegin()
+     + decompression : ZSTD_decompressBegin()
+     + variants _usingDict() are also allowed
+     + copyXCtx() works too
    - When a block is considered not compressible enough, ZSTD_compressBlock() result will be zero.
-     + User must test for such outcome and be able to deal with uncompressed data
-     + ZSTD_decompressBlock() doesn't accept uncompressed data as input
+     In which case, nothing is produced into `dst`.
+     + User must test for such outcome and deal directly with uncompressed data
+     + ZSTD_decompressBlock() doesn't accept uncompressed data as input !!
 */
 
 size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
diff --git a/programs/bench.c b/programs/bench.c
index 3055566..f5b6f8d 100644
--- a/programs/bench.c
+++ b/programs/bench.c
@@ -227,6 +227,7 @@
     void* const resultBuffer = malloc(srcSize);
     ZSTD_CCtx* refCtx = ZSTD_createCCtx();
     ZSTD_CCtx* ctx = ZSTD_createCCtx();
+    ZSTD_DCtx* refDCtx = ZSTD_createDCtx();
     ZSTD_DCtx* dctx = ZSTD_createDCtx();
     U64 crcOrig = XXH64(srcBuffer, srcSize, 0);
     U32 nbBlocks = 0;
@@ -235,7 +236,7 @@
     if (strlen(displayName)>17) displayName += strlen(displayName)-17;   /* can only display 17 characters */
 
     /* Memory allocation & restrictions */
-    if (!compressedBuffer || !resultBuffer || !blockTable || !refCtx || !ctx || !dctx)
+    if (!compressedBuffer || !resultBuffer || !blockTable || !refCtx || !ctx || !refDCtx || !dctx)
         EXM_THROW(31, "not enough memory");
 
     /* Init blockTable data */
@@ -298,7 +299,7 @@
                 ZSTD_compress_insertDictionary(refCtx, dictBuffer, dictBufferSize);
                 for (blockNb=0; blockNb<nbBlocks; blockNb++)
                 {
-                    ZSTD_duplicateCCtx(ctx, refCtx);
+                    ZSTD_copyCCtx(ctx, refCtx);
                     size_t rSize = ZSTD_compressContinue(ctx,
                                           blockTable[blockNb].cPtr,  blockTable[blockNb].cRoom,
                                           blockTable[blockNb].srcPtr,blockTable[blockNb].srcSize);
@@ -323,41 +324,42 @@
 
 #if 1
             /* Decompression */
-            memset(resultBuffer, 0xD6, srcSize);
+            memset(resultBuffer, 0xD6, srcSize);  /* warm result buffer */
 
             nbLoops = 0;
             milliTime = BMK_GetMilliStart();
             while (BMK_GetMilliStart() == milliTime);
             milliTime = BMK_GetMilliStart();
+
+            ZSTD_decompressBegin_usingDict(refDCtx, dictBuffer, dictBufferSize);
             for ( ; BMK_GetMilliSpan(milliTime) < TIMELOOP; nbLoops++) {
                 for (blockNb=0; blockNb<nbBlocks; blockNb++) {
-                    blockTable[blockNb].resSize = ZSTD_decompress_usingDict(dctx,
-                        blockTable[blockNb].resPtr, blockTable[blockNb].srcSize,
-                        blockTable[blockNb].cPtr, blockTable[blockNb].cSize,
-                        dictBuffer, dictBufferSize);
-                    if (ZSTD_isError(blockTable[blockNb].resSize))
-                        EXM_THROW(3, "ZSTD_decompress_usingDict() failed : %s", ZSTD_getErrorName(blockTable[blockNb].resSize));
-            }   }
-            milliTime = BMK_GetMilliSpan(milliTime);
+                    size_t regenSize;
 
+                    regenSize = ZSTD_decompress_usingPreparedDCtx(dctx, refDCtx,
+                        blockTable[blockNb].resPtr, blockTable[blockNb].srcSize,
+                        blockTable[blockNb].cPtr, blockTable[blockNb].cSize);
+
+                    if (ZSTD_isError(regenSize))
+                        EXM_THROW(3, "ZSTD_decompress_usingDict() failed : %s", ZSTD_getErrorName(regenSize));
+                    blockTable[blockNb].resSize = regenSize;
+            }   }
+
+            milliTime = BMK_GetMilliSpan(milliTime);
             if ((double)milliTime < fastestD*nbLoops) fastestD = (double)milliTime / nbLoops;
             DISPLAY("%2i-%-17.17s :%10i ->%10i (%5.3f),%6.1f MB/s ,%6.1f MB/s\r", loopNb, displayName, (int)srcSize, (int)cSize, ratio, (double)srcSize / fastestC / 1000., (double)srcSize / fastestD / 1000.);
 
             /* CRC Checking */
             crcCheck = XXH64(resultBuffer, srcSize, 0);
-            if (crcOrig!=crcCheck)
-            {
+            if (crcOrig!=crcCheck) {
                 size_t u;
                 DISPLAY("\n!!! WARNING !!! %14s : Invalid Checksum : %x != %x\n", displayName, (unsigned)crcOrig, (unsigned)crcCheck);
-                for (u=0; u<srcSize; u++)
-                {
-                    if (((const BYTE*)srcBuffer)[u] != ((const BYTE*)resultBuffer)[u])
-                    {
+                for (u=0; u<srcSize; u++) {
+                    if (((const BYTE*)srcBuffer)[u] != ((const BYTE*)resultBuffer)[u]) {
                         U32 segNb, bNb, pos;
                         size_t bacc = 0;
                         printf("Decoding error at pos %u ", (U32)u);
-                        for (segNb = 0; segNb < nbBlocks; segNb++)
-                        {
+                        for (segNb = 0; segNb < nbBlocks; segNb++) {
                             if (bacc + blockTable[segNb].srcSize > u) break;
                             bacc += blockTable[segNb].srcSize;
                         }
@@ -365,8 +367,7 @@
                         bNb = pos / (128 KB);
                         printf("(segment %u, block %u, pos %u) \n", segNb, bNb, pos);
                         break;
-                    }
-                }
+                }   }
                 break;
             }
 #endif
@@ -375,7 +376,7 @@
         if (crcOrig == crcCheck)
             DISPLAY("%2i-%-17.17s :%10i ->%10i (%5.3f),%6.1f MB/s ,%6.1f MB/s \n", cLevel, displayName, (int)srcSize, (int)cSize, ratio, (double)srcSize / fastestC / 1000., (double)srcSize / fastestD / 1000.);
         else
-            DISPLAY("X \n");
+            DISPLAY("\n");
     }
 
     /* clean up */
@@ -383,6 +384,7 @@
     free(resultBuffer);
     ZSTD_freeCCtx(refCtx);
     ZSTD_freeCCtx(ctx);
+    ZSTD_freeDCtx(refDCtx);
     ZSTD_freeDCtx(dctx);
     return 0;
 }
diff --git a/programs/fileio.c b/programs/fileio.c
index 0d49af2..4bab75e 100644
--- a/programs/fileio.c
+++ b/programs/fileio.c
@@ -529,8 +529,7 @@
     size_t readSize=alreadyLoaded;
 
     /* Main decompression Loop */
-    ZBUFF_decompressInit(ress.dctx);
-    ZBUFF_decompressWithDictionary(ress.dctx, ress.dictBuffer, ress.dictBufferSize);
+    ZBUFF_decompressInitDictionary(ress.dctx, ress.dictBuffer, ress.dictBufferSize);
     while (1)
     {
         /* Decode */
diff --git a/programs/fuzzer.c b/programs/fuzzer.c
index 4058ef2..b72f7e5 100644
--- a/programs/fuzzer.c
+++ b/programs/fuzzer.c
@@ -207,7 +207,7 @@
         if (ZSTD_isError(result)) goto _output_error;
         result = ZSTD_compress_insertDictionary(ctxOrig, CNBuffer, dictSize);
         if (ZSTD_isError(result)) goto _output_error;
-        result = ZSTD_duplicateCCtx(ctxDuplicated, ctxOrig);
+        result = ZSTD_copyCCtx(ctxDuplicated, ctxOrig);
         if (ZSTD_isError(result)) goto _output_error;
         DISPLAYLEVEL(4, "OK \n");
 
@@ -284,7 +284,7 @@
         DISPLAYLEVEL(4, "OK \n");
 
         DISPLAYLEVEL(4, "test%3i : Block decompression test : ", testNb++);
-        result = ZSTD_resetDCtx(dctx);
+        result = ZSTD_decompressBegin(dctx);
         if (ZSTD_isError(result)) goto _output_error;
         result = ZSTD_decompressBlock(dctx, decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, compressedBuffer, cSize);
         if (ZSTD_isError(result)) goto _output_error;
@@ -302,9 +302,8 @@
         DISPLAYLEVEL(4, "OK \n");
 
         DISPLAYLEVEL(4, "test%3i : Dictionary Block decompression test : ", testNb++);
-        result = ZSTD_resetDCtx(dctx);
+        result = ZSTD_decompressBegin_usingDict(dctx, CNBuffer, dictSize);
         if (ZSTD_isError(result)) goto _output_error;
-        ZSTD_decompress_insertDictionary(dctx, CNBuffer, dictSize);
         result = ZSTD_decompressBlock(dctx, decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, compressedBuffer, cSize);
         if (ZSTD_isError(result)) goto _output_error;
         if (result != blockSize) goto _output_error;
@@ -574,7 +573,7 @@
         CHECK (ZSTD_isError(errorCode), "start streaming error : %s", ZSTD_getErrorName(errorCode));
         errorCode = ZSTD_compress_insertDictionary(refCtx, dict, dictSize);
         CHECK (ZSTD_isError(errorCode), "dictionary insertion error : %s", ZSTD_getErrorName(errorCode));
-        errorCode = ZSTD_duplicateCCtx(ctx, refCtx);
+        errorCode = ZSTD_copyCCtx(ctx, refCtx);
         CHECK (ZSTD_isError(errorCode), "context duplication error : %s", ZSTD_getErrorName(errorCode));
         totalTestSize = 0; cSize = 0;
         for (n=0; n<nbChunks; n++)
@@ -603,9 +602,8 @@
         crcOrig = XXH64_digest(xxh64);
 
         /* streaming decompression test */
-        errorCode = ZSTD_resetDCtx(dctx);
+        errorCode = ZSTD_decompressBegin_usingDict(dctx, dict, dictSize);
         CHECK (ZSTD_isError(errorCode), "cannot init DCtx : %s", ZSTD_getErrorName(errorCode));
-        ZSTD_decompress_insertDictionary(dctx, dict, dictSize);
         totalCSize = 0;
         totalGenSize = 0;
         while (totalCSize < cSize)
diff --git a/programs/zbufftest.c b/programs/zbufftest.c
index f9677f0..4c1b7ba 100644
--- a/programs/zbufftest.c
+++ b/programs/zbufftest.c
@@ -174,8 +174,7 @@
 
     /* Basic decompression test */
     DISPLAYLEVEL(4, "test%3i : decompress %u bytes : ", testNb++, COMPRESSIBLE_NOISE_LENGTH);
-    ZBUFF_decompressInit(zd);
-    ZBUFF_decompressWithDictionary(zd, CNBuffer, 128 KB);
+    ZBUFF_decompressInitDictionary(zd, CNBuffer, 128 KB);
     readSize = cSize;
     genSize = CNBufferSize;
     result = ZBUFF_decompressContinue(zd, decodedBuffer, &genSize, compressedBuffer, &readSize);
@@ -374,8 +373,7 @@
         crcOrig = XXH64_digest(xxh64);
 
         /* multi - fragments decompression test */
-        ZBUFF_decompressInit(zd);
-        ZBUFF_decompressWithDictionary(zd, dict, dictSize);
+        ZBUFF_decompressInitDictionary(zd, dict, dictSize);
         totalCSize = 0;
         totalGenSize = 0;
         while (totalCSize < cSize)