Merge pull request #744 from terrelln/dev

[pzstd] Remove appveyor tests
diff --git a/Makefile b/Makefile
index 679e056..9afdf65 100644
--- a/Makefile
+++ b/Makefile
@@ -224,10 +224,10 @@
 	LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=address" $(MAKE) -C $(TESTDIR) $*
 
 msan: clean
-	$(MAKE) test CC=clang MOREFLAGS="-g -fsanitize=memory -fno-omit-frame-pointer"   # datagen.c fails this test for no obvious reason
+	$(MAKE) test CC=clang MOREFLAGS="-g -fsanitize=memory -fno-omit-frame-pointer" HAVE_LZMA=0   # datagen.c fails this test for no obvious reason
 
 msan-%: clean
-	LDFLAGS=-fuse-ld=gold MOREFLAGS="-fno-sanitize-recover=all -fsanitize=memory -fno-omit-frame-pointer" FUZZER_FLAGS=--no-big-tests $(MAKE) -C $(TESTDIR) $*
+	LDFLAGS=-fuse-ld=gold MOREFLAGS="-fno-sanitize-recover=all -fsanitize=memory -fno-omit-frame-pointer" FUZZER_FLAGS=--no-big-tests $(MAKE) -C $(TESTDIR) HAVE_LZMA=0 $*
 
 asan32: clean
 	$(MAKE) -C $(TESTDIR) test32 CC=clang MOREFLAGS="-g -fsanitize=address"
diff --git a/NEWS b/NEWS
index db9df96..d23a58f 100644
--- a/NEWS
+++ b/NEWS
@@ -1,12 +1,15 @@
 v1.3.0
 cli : new : `--list` command, by Paul Cruz
+cli : changed : xz/lzma support enabled by default
 cli : changed : `-t *` continue processing list after a decompression error
 API : added : ZSTD_versionString()
+API : promoted to stable status : ZSTD_getFrameContentSize(), by Sean Purcell
 API exp : new advanced API : ZSTD_compress_generic(), ZSTD_CCtx_setParameter()
 API exp : new : API for static or external allocation : ZSTD_initStatic?Ctx()
 API exp : added : ZSTD_decompressBegin_usingDDict(), requested by Guy Riddle (#700)
+API exp : clarified memory estimation / measurement functions.
 API exp : changed : strongest strategy renamed ZSTD_btultra, fastest strategy ZSTD_fast set to 1
-API exp : clarified presentation of memory estimation / measurement functions.
+tools : decodecorpus can generate random dictionary-compressed samples, by Paul Cruz
 new : contrib/seekable_format, demo and API, by Sean Purcell
 changed : contrib/linux-kernel, updated version and license, by Nick Terrell
 
diff --git a/contrib/linux-kernel/0000-cover-letter.patch b/contrib/linux-kernel/0000-cover-letter.patch
new file mode 100644
index 0000000..763b5a9
--- /dev/null
+++ b/contrib/linux-kernel/0000-cover-letter.patch
@@ -0,0 +1,96 @@
+From 8bc9a0ae5c86a6d02d9a5274b9965ddac0e8d330 Mon Sep 17 00:00:00 2001
+From: Nick Terrell <terrelln@fb.com>
+Date: Wed, 28 Jun 2017 22:00:00 -0700
+Subject: [PATCH v2 0/4] Add xxhash and zstd modules
+
+Hi all,
+
+This patch set adds xxhash, zstd compression, and zstd decompression
+modules. It also adds zstd support to BtrFS and SquashFS.
+
+Each patch has relevant summaries, benchmarks, and tests.
+
+Best,
+Nick Terrell
+
+Changelog:
+
+v1 -> v2:
+- Make pointer in lib/xxhash.c:394 non-const (1/4)
+- Use div_u64() for division of u64s (2/4)
+- Reduce stack usage of ZSTD_compressSequences(), ZSTD_buildSeqTable(),
+  ZSTD_decompressSequencesLong(), FSE_buildDTable(), FSE_decompress_wksp(),
+  HUF_writeCTable(), HUF_readStats(), HUF_readCTable(),
+  HUF_compressWeights(), HUF_readDTableX2(), and HUF_readDTableX4() (2/4)
+- No zstd function uses more than 400 B of stack space (2/4)
+
+Nick Terrell (4):
+  lib: Add xxhash module
+  lib: Add zstd modules
+  btrfs: Add zstd support
+  squashfs: Add zstd support
+
+ fs/btrfs/Kconfig           |    2 +
+ fs/btrfs/Makefile          |    2 +-
+ fs/btrfs/compression.c     |    1 +
+ fs/btrfs/compression.h     |    6 +-
+ fs/btrfs/ctree.h           |    1 +
+ fs/btrfs/disk-io.c         |    2 +
+ fs/btrfs/ioctl.c           |    6 +-
+ fs/btrfs/props.c           |    6 +
+ fs/btrfs/super.c           |   12 +-
+ fs/btrfs/sysfs.c           |    2 +
+ fs/btrfs/zstd.c            |  433 ++++++
+ fs/squashfs/Kconfig        |   14 +
+ fs/squashfs/Makefile       |    1 +
+ fs/squashfs/decompressor.c |    7 +
+ fs/squashfs/decompressor.h |    4 +
+ fs/squashfs/squashfs_fs.h  |    1 +
+ fs/squashfs/zstd_wrapper.c |  150 ++
+ include/linux/xxhash.h     |  236 +++
+ include/linux/zstd.h       | 1157 +++++++++++++++
+ include/uapi/linux/btrfs.h |    8 +-
+ lib/Kconfig                |   11 +
+ lib/Makefile               |    3 +
+ lib/xxhash.c               |  500 +++++++
+ lib/zstd/Makefile          |   18 +
+ lib/zstd/bitstream.h       |  374 +++++
+ lib/zstd/compress.c        | 3479 ++++++++++++++++++++++++++++++++++++++++++++
+ lib/zstd/decompress.c      | 2526 ++++++++++++++++++++++++++++++++
+ lib/zstd/entropy_common.c  |  243 ++++
+ lib/zstd/error_private.h   |   53 +
+ lib/zstd/fse.h             |  575 ++++++++
+ lib/zstd/fse_compress.c    |  795 ++++++++++
+ lib/zstd/fse_decompress.c  |  332 +++++
+ lib/zstd/huf.h             |  212 +++
+ lib/zstd/huf_compress.c    |  771 ++++++++++
+ lib/zstd/huf_decompress.c  |  960 ++++++++++++
+ lib/zstd/mem.h             |  151 ++
+ lib/zstd/zstd_common.c     |   75 +
+ lib/zstd/zstd_internal.h   |  269 ++++
+ lib/zstd/zstd_opt.h        | 1014 +++++++++++++
+ 39 files changed, 14400 insertions(+), 12 deletions(-)
+ create mode 100644 fs/btrfs/zstd.c
+ create mode 100644 fs/squashfs/zstd_wrapper.c
+ create mode 100644 include/linux/xxhash.h
+ create mode 100644 include/linux/zstd.h
+ create mode 100644 lib/xxhash.c
+ create mode 100644 lib/zstd/Makefile
+ create mode 100644 lib/zstd/bitstream.h
+ create mode 100644 lib/zstd/compress.c
+ create mode 100644 lib/zstd/decompress.c
+ create mode 100644 lib/zstd/entropy_common.c
+ create mode 100644 lib/zstd/error_private.h
+ create mode 100644 lib/zstd/fse.h
+ create mode 100644 lib/zstd/fse_compress.c
+ create mode 100644 lib/zstd/fse_decompress.c
+ create mode 100644 lib/zstd/huf.h
+ create mode 100644 lib/zstd/huf_compress.c
+ create mode 100644 lib/zstd/huf_decompress.c
+ create mode 100644 lib/zstd/mem.h
+ create mode 100644 lib/zstd/zstd_common.c
+ create mode 100644 lib/zstd/zstd_internal.h
+ create mode 100644 lib/zstd/zstd_opt.h
+
+--
+2.9.3
diff --git a/contrib/linux-kernel/0001-lib-Add-xxhash-module.patch b/contrib/linux-kernel/0001-lib-Add-xxhash-module.patch
index 9a8f50a..84a2c53 100644
--- a/contrib/linux-kernel/0001-lib-Add-xxhash-module.patch
+++ b/contrib/linux-kernel/0001-lib-Add-xxhash-module.patch
@@ -1,7 +1,7 @@
-From e75beb7c2e05550b2846e31ad8a0082c188504da Mon Sep 17 00:00:00 2001
+From 5ac909c415ab4a18fd90794793c96e450795e8c6 Mon Sep 17 00:00:00 2001
 From: Nick Terrell <terrelln@fb.com>
-Date: Wed, 21 Jun 2017 17:27:42 -0700
-Subject: [PATCH 1/4] lib: Add xxhash module
+Date: Wed, 21 Jun 2017 17:37:36 -0700
+Subject: [PATCH v2 1/4] lib: Add xxhash module
 
 Adds xxhash kernel module with xxh32 and xxh64 hashes. xxhash is an
 extremely fast non-cryptographic hash algorithm for checksumming.
@@ -73,6 +73,9 @@
 
 Signed-off-by: Nick Terrell <terrelln@fb.com>
 ---
+v1 -> v2:
+- Make pointer in lib/xxhash.c:394 non-const
+
  include/linux/xxhash.h | 236 +++++++++++++++++++++++
  lib/Kconfig            |   3 +
  lib/Makefile           |   1 +
@@ -330,7 +333,7 @@
 @@ -184,6 +184,9 @@ config CRC8
  	  when they need to do cyclic redundancy check according CRC8
  	  algorithm. Module will be called crc8.
- 
+
 +config XXHASH
 +	tristate
 +
@@ -347,11 +350,11 @@
  obj-$(CONFIG_CRC8)	+= crc8.o
 +obj-$(CONFIG_XXHASH)	+= xxhash.o
  obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
- 
+
  obj-$(CONFIG_842_COMPRESS) += 842/
 diff --git a/lib/xxhash.c b/lib/xxhash.c
 new file mode 100644
-index 0000000..dc94904
+index 0000000..aa61e2a
 --- /dev/null
 +++ b/lib/xxhash.c
 @@ -0,0 +1,500 @@
@@ -748,7 +751,7 @@
 +	}
 +
 +	if (state->memsize) { /* tmp buffer is full */
-+		const uint64_t *p64 = state->mem64;
++		uint64_t *p64 = state->mem64;
 +
 +		memcpy(((uint8_t *)p64) + state->memsize, input,
 +			32 - state->memsize);
@@ -855,6 +858,5 @@
 +
 +MODULE_LICENSE("Dual BSD/GPL");
 +MODULE_DESCRIPTION("xxHash");
--- 
+--
 2.9.3
-
diff --git a/contrib/linux-kernel/0002-lib-Add-zstd-modules.patch b/contrib/linux-kernel/0002-lib-Add-zstd-modules.patch
index f94afe3..9710939 100644
--- a/contrib/linux-kernel/0002-lib-Add-zstd-modules.patch
+++ b/contrib/linux-kernel/0002-lib-Add-zstd-modules.patch
@@ -1,7 +1,7 @@
-From b52ae824ae6c0f7c7786380b34da9daaa54bfc26 Mon Sep 17 00:00:00 2001
+From d2626127c6d6e60e940dd9a3ed58323bdcdc4930 Mon Sep 17 00:00:00 2001
 From: Nick Terrell <terrelln@fb.com>
-Date: Wed, 21 Jun 2017 17:31:24 -0700
-Subject: [PATCH 2/4] lib: Add zstd modules
+Date: Tue, 16 May 2017 14:55:36 -0700
+Subject: [PATCH v2 2/4] lib: Add zstd modules
 
 Add zstd compression and decompression kernel modules.
 zstd offers a wide varity of compression speed and quality trade-offs.
@@ -102,26 +102,34 @@
 
 Signed-off-by: Nick Terrell <terrelln@fb.com>
 ---
+v1 -> v2:
+- Use div_u64() for division of u64s
+- Reduce stack usage of ZSTD_compressSequences(), ZSTD_buildSeqTable(),
+  ZSTD_decompressSequencesLong(), FSE_buildDTable(), FSE_decompress_wksp(),
+  HUF_writeCTable(), HUF_readStats(), HUF_readCTable(),
+  HUF_compressWeights(), HUF_readDTableX2(), and HUF_readDTableX4()
+- No function uses more than 400 B of stack space
+
  include/linux/zstd.h      | 1157 +++++++++++++++
  lib/Kconfig               |    8 +
  lib/Makefile              |    2 +
  lib/zstd/Makefile         |   18 +
  lib/zstd/bitstream.h      |  374 +++++
- lib/zstd/compress.c       | 3468 +++++++++++++++++++++++++++++++++++++++++++++
- lib/zstd/decompress.c     | 2514 ++++++++++++++++++++++++++++++++
- lib/zstd/entropy_common.c |  244 ++++
+ lib/zstd/compress.c       | 3479 +++++++++++++++++++++++++++++++++++++++++++++
+ lib/zstd/decompress.c     | 2526 ++++++++++++++++++++++++++++++++
+ lib/zstd/entropy_common.c |  243 ++++
  lib/zstd/error_private.h  |   53 +
- lib/zstd/fse.h            |  584 ++++++++
- lib/zstd/fse_compress.c   |  857 +++++++++++
- lib/zstd/fse_decompress.c |  313 ++++
- lib/zstd/huf.h            |  203 +++
- lib/zstd/huf_compress.c   |  731 ++++++++++
- lib/zstd/huf_decompress.c |  920 ++++++++++++
+ lib/zstd/fse.h            |  575 ++++++++
+ lib/zstd/fse_compress.c   |  795 +++++++++++
+ lib/zstd/fse_decompress.c |  332 +++++
+ lib/zstd/huf.h            |  212 +++
+ lib/zstd/huf_compress.c   |  771 ++++++++++
+ lib/zstd/huf_decompress.c |  960 +++++++++++++
  lib/zstd/mem.h            |  151 ++
  lib/zstd/zstd_common.c    |   75 +
  lib/zstd/zstd_internal.h  |  269 ++++
  lib/zstd/zstd_opt.h       | 1014 +++++++++++++
- 19 files changed, 12955 insertions(+)
+ 19 files changed, 13014 insertions(+)
  create mode 100644 include/linux/zstd.h
  create mode 100644 lib/zstd/Makefile
  create mode 100644 lib/zstd/bitstream.h
@@ -1741,10 +1749,10 @@
 +#endif /* BITSTREAM_H_MODULE */
 diff --git a/lib/zstd/compress.c b/lib/zstd/compress.c
 new file mode 100644
-index 0000000..1aff542
+index 0000000..d60ab7d
 --- /dev/null
 +++ b/lib/zstd/compress.c
-@@ -0,0 +1,3468 @@
+@@ -0,0 +1,3479 @@
 +/**
 + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
 + * All rights reserved.
@@ -1831,7 +1839,7 @@
 +	FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
 +	FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
 +	FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
-+	unsigned tmpCounters[HUF_WORKSPACE_SIZE_U32];
++	unsigned tmpCounters[HUF_COMPRESS_WORKSPACE_SIZE_U32];
 +};
 +
 +size_t ZSTD_CCtxWorkspaceBound(ZSTD_compressionParameters cParams)
@@ -2334,8 +2342,6 @@
 +{
 +	const int longOffsets = zc->params.cParams.windowLog > STREAM_ACCUMULATOR_MIN;
 +	const seqStore_t *seqStorePtr = &(zc->seqStore);
-+	U32 count[MaxSeq + 1];
-+	S16 norm[MaxSeq + 1];
 +	FSE_CTable *CTable_LitLength = zc->litlengthCTable;
 +	FSE_CTable *CTable_OffsetBits = zc->offcodeCTable;
 +	FSE_CTable *CTable_MatchLength = zc->matchlengthCTable;
@@ -2349,7 +2355,21 @@
 +	BYTE *op = ostart;
 +	size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
 +	BYTE *seqHead;
-+	BYTE scratchBuffer[1 << MAX(MLFSELog, LLFSELog)];
++
++	U32 *count;
++	S16 *norm;
++	U32 *workspace;
++	size_t workspaceSize = sizeof(zc->tmpCounters);
++	{
++		size_t spaceUsed32 = 0;
++		count = (U32 *)zc->tmpCounters + spaceUsed32;
++		spaceUsed32 += MaxSeq + 1;
++		norm = (S16 *)((U32 *)zc->tmpCounters + spaceUsed32);
++		spaceUsed32 += ALIGN(sizeof(S16) * (MaxSeq + 1), sizeof(U32)) >> 2;
++
++		workspace = (U32 *)zc->tmpCounters + spaceUsed32;
++		workspaceSize -= (spaceUsed32 << 2);
++	}
 +
 +	/* Compress literals */
 +	{
@@ -2385,7 +2405,7 @@
 +	/* CTable for Literal Lengths */
 +	{
 +		U32 max = MaxLL;
-+		size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, zc->tmpCounters);
++		size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, workspace);
 +		if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
 +			*op++ = llCodeTable[0];
 +			FSE_buildCTable_rle(CTable_LitLength, (BYTE)max);
@@ -2393,7 +2413,7 @@
 +		} else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
 +			LLtype = set_repeat;
 +		} else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (LL_defaultNormLog - 1)))) {
-+			FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
++			FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, workspace, workspaceSize);
 +			LLtype = set_basic;
 +		} else {
 +			size_t nbSeq_1 = nbSeq;
@@ -2409,7 +2429,7 @@
 +					return NCountSize;
 +				op += NCountSize;
 +			}
-+			FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
++			FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, workspace, workspaceSize);
 +			LLtype = set_compressed;
 +		}
 +	}
@@ -2417,7 +2437,7 @@
 +	/* CTable for Offsets */
 +	{
 +		U32 max = MaxOff;
-+		size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, zc->tmpCounters);
++		size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, workspace);
 +		if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
 +			*op++ = ofCodeTable[0];
 +			FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
@@ -2425,7 +2445,7 @@
 +		} else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
 +			Offtype = set_repeat;
 +		} else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (OF_defaultNormLog - 1)))) {
-+			FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, MaxOff, OF_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
++			FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, MaxOff, OF_defaultNormLog, workspace, workspaceSize);
 +			Offtype = set_basic;
 +		} else {
 +			size_t nbSeq_1 = nbSeq;
@@ -2441,7 +2461,7 @@
 +					return NCountSize;
 +				op += NCountSize;
 +			}
-+			FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
++			FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, workspace, workspaceSize);
 +			Offtype = set_compressed;
 +		}
 +	}
@@ -2449,7 +2469,7 @@
 +	/* CTable for MatchLengths */
 +	{
 +		U32 max = MaxML;
-+		size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, zc->tmpCounters);
++		size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, workspace);
 +		if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
 +			*op++ = *mlCodeTable;
 +			FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max);
@@ -2457,7 +2477,7 @@
 +		} else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
 +			MLtype = set_repeat;
 +		} else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (ML_defaultNormLog - 1)))) {
-+			FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
++			FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, workspace, workspaceSize);
 +			MLtype = set_basic;
 +		} else {
 +			size_t nbSeq_1 = nbSeq;
@@ -2473,7 +2493,7 @@
 +					return NCountSize;
 +				op += NCountSize;
 +			}
-+			FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
++			FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, workspace, workspaceSize);
 +			MLtype = set_compressed;
 +		}
 +	}
@@ -4359,14 +4379,13 @@
 +	const BYTE *const dictEnd = dictPtr + dictSize;
 +	short offcodeNCount[MaxOff + 1];
 +	unsigned offcodeMaxValue = MaxOff;
-+	BYTE scratchBuffer[1 << MAX(MLFSELog, LLFSELog)];
 +
 +	dictPtr += 4; /* skip magic number */
 +	cctx->dictID = cctx->params.fParams.noDictIDFlag ? 0 : ZSTD_readLE32(dictPtr);
 +	dictPtr += 4;
 +
 +	{
-+		size_t const hufHeaderSize = HUF_readCTable(cctx->hufTable, 255, dictPtr, dictEnd - dictPtr);
++		size_t const hufHeaderSize = HUF_readCTable_wksp(cctx->hufTable, 255, dictPtr, dictEnd - dictPtr, cctx->tmpCounters, sizeof(cctx->tmpCounters));
 +		if (HUF_isError(hufHeaderSize))
 +			return ERROR(dictionary_corrupted);
 +		dictPtr += hufHeaderSize;
@@ -4380,7 +4399,7 @@
 +		if (offcodeLog > OffFSELog)
 +			return ERROR(dictionary_corrupted);
 +		/* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
-+		CHECK_E(FSE_buildCTable_wksp(cctx->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, scratchBuffer, sizeof(scratchBuffer)),
++		CHECK_E(FSE_buildCTable_wksp(cctx->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)),
 +			dictionary_corrupted);
 +		dictPtr += offcodeHeaderSize;
 +	}
@@ -4396,7 +4415,7 @@
 +		/* Every match length code must have non-zero probability */
 +		CHECK_F(ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML));
 +		CHECK_E(
-+		    FSE_buildCTable_wksp(cctx->matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, scratchBuffer, sizeof(scratchBuffer)),
++		    FSE_buildCTable_wksp(cctx->matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)),
 +		    dictionary_corrupted);
 +		dictPtr += matchlengthHeaderSize;
 +	}
@@ -4411,7 +4430,7 @@
 +			return ERROR(dictionary_corrupted);
 +		/* Every literal length code must have non-zero probability */
 +		CHECK_F(ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL));
-+		CHECK_E(FSE_buildCTable_wksp(cctx->litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, scratchBuffer, sizeof(scratchBuffer)),
++		CHECK_E(FSE_buildCTable_wksp(cctx->litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)),
 +			dictionary_corrupted);
 +		dictPtr += litlengthHeaderSize;
 +	}
@@ -5215,10 +5234,10 @@
 +MODULE_DESCRIPTION("Zstd Compressor");
 diff --git a/lib/zstd/decompress.c b/lib/zstd/decompress.c
 new file mode 100644
-index 0000000..ec673d7
+index 0000000..62449ae
 --- /dev/null
 +++ b/lib/zstd/decompress.c
-@@ -0,0 +1,2514 @@
+@@ -0,0 +1,2526 @@
 +/**
 + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
 + * All rights reserved.
@@ -5291,6 +5310,7 @@
 +	FSE_DTable OFTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
 +	FSE_DTable MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
 +	HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
++	U64 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32 / 2];
 +	U32 rep[ZSTD_REP_NUM];
 +} ZSTD_entropyTables_t;
 +
@@ -5704,8 +5724,10 @@
 +					    ? (singleStream ? HUF_decompress1X_usingDTable(dctx->litBuffer, litSize, istart + lhSize, litCSize, dctx->HUFptr)
 +							    : HUF_decompress4X_usingDTable(dctx->litBuffer, litSize, istart + lhSize, litCSize, dctx->HUFptr))
 +					    : (singleStream
-+						   ? HUF_decompress1X2_DCtx(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize)
-+						   : HUF_decompress4X_hufOnly(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize))))
++						   ? HUF_decompress1X2_DCtx_wksp(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize,
++										 dctx->entropy.workspace, sizeof(dctx->entropy.workspace))
++						   : HUF_decompress4X_hufOnly_wksp(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize,
++										   dctx->entropy.workspace, sizeof(dctx->entropy.workspace)))))
 +					return ERROR(corruption_detected);
 +
 +				dctx->litPtr = dctx->litBuffer;
@@ -5968,7 +5990,7 @@
 +			  or an error code if it fails, testable with ZSTD_isError()
 +*/
 +static size_t ZSTD_buildSeqTable(FSE_DTable *DTableSpace, const FSE_DTable **DTablePtr, symbolEncodingType_e type, U32 max, U32 maxLog, const void *src,
-+				 size_t srcSize, const FSE_decode_t4 *defaultTable, U32 flagRepeatTable)
++				 size_t srcSize, const FSE_decode_t4 *defaultTable, U32 flagRepeatTable, void *workspace, size_t workspaceSize)
 +{
 +	const void *const tmpPtr = defaultTable; /* bypass strict aliasing */
 +	switch (type) {
@@ -5988,15 +6010,23 @@
 +	default: /* impossible */
 +	case set_compressed: {
 +		U32 tableLog;
-+		S16 norm[MaxSeq + 1];
-+		size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
-+		if (FSE_isError(headerSize))
-+			return ERROR(corruption_detected);
-+		if (tableLog > maxLog)
-+			return ERROR(corruption_detected);
-+		FSE_buildDTable(DTableSpace, norm, max, tableLog);
-+		*DTablePtr = DTableSpace;
-+		return headerSize;
++		S16 *norm = (S16 *)workspace;
++		size_t const spaceUsed32 = ALIGN(sizeof(S16) * (MaxSeq + 1), sizeof(U32)) >> 2;
++
++		if ((spaceUsed32 << 2) > workspaceSize)
++			return ERROR(GENERIC);
++		workspace = (U32 *)workspace + spaceUsed32;
++		workspaceSize -= (spaceUsed32 << 2);
++		{
++			size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
++			if (FSE_isError(headerSize))
++				return ERROR(corruption_detected);
++			if (tableLog > maxLog)
++				return ERROR(corruption_detected);
++			FSE_buildDTable_wksp(DTableSpace, norm, max, tableLog, workspace, workspaceSize);
++			*DTablePtr = DTableSpace;
++			return headerSize;
++		}
 +	}
 +	}
 +}
@@ -6044,21 +6074,21 @@
 +		/* Build DTables */
 +		{
 +			size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr, LLtype, MaxLL, LLFSELog, ip, iend - ip,
-+								  LL_defaultDTable, dctx->fseEntropy);
++								  LL_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace));
 +			if (ZSTD_isError(llhSize))
 +				return ERROR(corruption_detected);
 +			ip += llhSize;
 +		}
 +		{
 +			size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr, OFtype, MaxOff, OffFSELog, ip, iend - ip,
-+								  OF_defaultDTable, dctx->fseEntropy);
++								  OF_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace));
 +			if (ZSTD_isError(ofhSize))
 +				return ERROR(corruption_detected);
 +			ip += ofhSize;
 +		}
 +		{
 +			size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr, MLtype, MaxML, MLFSELog, ip, iend - ip,
-+								  ML_defaultDTable, dctx->fseEntropy);
++								  ML_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace));
 +			if (ZSTD_isError(mlhSize))
 +				return ERROR(corruption_detected);
 +			ip += mlhSize;
@@ -6581,10 +6611,11 @@
 +#define STORED_SEQS 4
 +#define STOSEQ_MASK (STORED_SEQS - 1)
 +#define ADVANCED_SEQS 4
-+		seq_t sequences[STORED_SEQS];
++		seq_t *sequences = (seq_t *)dctx->entropy.workspace;
 +		int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
 +		seqState_t seqState;
 +		int seqNb;
++		ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.workspace) >= sizeof(seq_t) * STORED_SEQS);
 +		dctx->fseEntropy = 1;
 +		{
 +			U32 i;
@@ -7087,7 +7118,7 @@
 +	dictPtr += 8; /* skip header = magic + dictID */
 +
 +	{
-+		size_t const hSize = HUF_readDTableX4(entropy->hufTable, dictPtr, dictEnd - dictPtr);
++		size_t const hSize = HUF_readDTableX4_wksp(entropy->hufTable, dictPtr, dictEnd - dictPtr, entropy->workspace, sizeof(entropy->workspace));
 +		if (HUF_isError(hSize))
 +			return ERROR(dictionary_corrupted);
 +		dictPtr += hSize;
@@ -7101,7 +7132,7 @@
 +			return ERROR(dictionary_corrupted);
 +		if (offcodeLog > OffFSELog)
 +			return ERROR(dictionary_corrupted);
-+		CHECK_E(FSE_buildDTable(entropy->OFTable, offcodeNCount, offcodeMaxValue, offcodeLog), dictionary_corrupted);
++		CHECK_E(FSE_buildDTable_wksp(entropy->OFTable, offcodeNCount, offcodeMaxValue, offcodeLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted);
 +		dictPtr += offcodeHeaderSize;
 +	}
 +
@@ -7113,7 +7144,7 @@
 +			return ERROR(dictionary_corrupted);
 +		if (matchlengthLog > MLFSELog)
 +			return ERROR(dictionary_corrupted);
-+		CHECK_E(FSE_buildDTable(entropy->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog), dictionary_corrupted);
++		CHECK_E(FSE_buildDTable_wksp(entropy->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted);
 +		dictPtr += matchlengthHeaderSize;
 +	}
 +
@@ -7125,7 +7156,7 @@
 +			return ERROR(dictionary_corrupted);
 +		if (litlengthLog > LLFSELog)
 +			return ERROR(dictionary_corrupted);
-+		CHECK_E(FSE_buildDTable(entropy->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog), dictionary_corrupted);
++		CHECK_E(FSE_buildDTable_wksp(entropy->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted);
 +		dictPtr += litlengthHeaderSize;
 +	}
 +
@@ -7735,10 +7766,10 @@
 +MODULE_DESCRIPTION("Zstd Decompressor");
 diff --git a/lib/zstd/entropy_common.c b/lib/zstd/entropy_common.c
 new file mode 100644
-index 0000000..b354fc2
+index 0000000..2b0a643
 --- /dev/null
 +++ b/lib/zstd/entropy_common.c
-@@ -0,0 +1,244 @@
+@@ -0,0 +1,243 @@
 +/*
 + * Common functions of New Generation Entropy library
 + * Copyright (C) 2016, Yann Collet.
@@ -7905,7 +7936,7 @@
 +	@return : size read from `src` , or an error Code .
 +	Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
 +*/
-+size_t HUF_readStats(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize)
++size_t HUF_readStats_wksp(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
 +{
 +	U32 weightTotal;
 +	const BYTE *ip = (const BYTE *)src;
@@ -7933,10 +7964,9 @@
 +			}
 +		}
 +	} else {						 /* header compressed with FSE (normal case) */
-+		FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)]; /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
 +		if (iSize + 1 > srcSize)
 +			return ERROR(srcSize_wrong);
-+		oSize = FSE_decompress_wksp(huffWeight, hwSize - 1, ip + 1, iSize, fseWorkspace, 6); /* max (hwSize-1) values decoded, as last one is implied */
++		oSize = FSE_decompress_wksp(huffWeight, hwSize - 1, ip + 1, iSize, 6, workspace, workspaceSize); /* max (hwSize-1) values decoded, as last one is implied */
 +		if (FSE_isError(oSize))
 +			return oSize;
 +	}
@@ -8044,10 +8074,10 @@
 +#endif /* ERROR_H_MODULE */
 diff --git a/lib/zstd/fse.h b/lib/zstd/fse.h
 new file mode 100644
-index 0000000..bc2962a
+index 0000000..7460ab0
 --- /dev/null
 +++ b/lib/zstd/fse.h
-@@ -0,0 +1,584 @@
+@@ -0,0 +1,575 @@
 +/*
 + * FSE : Finite State Entropy codec
 + * Public Prototypes declaration
@@ -8237,7 +8267,7 @@
 +/*! FSE_buildDTable():
 +	Builds 'dt', which must be already allocated, using FSE_createDTable().
 +	return : 0, or an errorCode, which can be tested using FSE_isError() */
-+FSE_PUBLIC_API size_t FSE_buildDTable(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
++FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize);
 +
 +/*! FSE_decompress_usingDTable():
 +	Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
@@ -8313,15 +8343,6 @@
 +unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
 +/**< same as FSE_optimalTableLog(), which used `minus==2` */
 +
-+/* FSE_compress_wksp() :
-+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
-+ * FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
-+ */
-+#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) \
-+	(FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024))
-+size_t FSE_compress_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
-+			 size_t wkspSize);
-+
 +size_t FSE_buildCTable_raw(FSE_CTable *ct, unsigned nbBits);
 +/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
 +
@@ -8340,7 +8361,7 @@
 +size_t FSE_buildDTable_rle(FSE_DTable *dt, unsigned char symbolValue);
 +/**< build a fake FSE_DTable, designed to always generate the same symbolValue */
 +
-+size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, FSE_DTable *workSpace, unsigned maxLog);
++size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, unsigned maxLog, void *workspace, size_t workspaceSize);
 +/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
 +
 +/* *****************************************
@@ -8634,10 +8655,10 @@
 +#endif /* FSE_H */
 diff --git a/lib/zstd/fse_compress.c b/lib/zstd/fse_compress.c
 new file mode 100644
-index 0000000..e016bb1
+index 0000000..ef3d174
 --- /dev/null
 +++ b/lib/zstd/fse_compress.c
-@@ -0,0 +1,857 @@
+@@ -0,0 +1,795 @@
 +/*
 + * FSE : Finite State Entropy encoder
 + * Copyright (C) 2013-2015, Yann Collet.
@@ -8688,6 +8709,8 @@
 +#include "bitstream.h"
 +#include "fse.h"
 +#include <linux/compiler.h>
++#include <linux/kernel.h>
++#include <linux/math64.h>
 +#include <linux/string.h> /* memcpy, memset */
 +
 +/* **************************************************************
@@ -8727,7 +8750,7 @@
 + * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
 + * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
 + */
-+size_t FSE_buildCTable_wksp(FSE_CTable *ct, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workSpace, size_t wkspSize)
++size_t FSE_buildCTable_wksp(FSE_CTable *ct, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize)
 +{
 +	U32 const tableSize = 1 << tableLog;
 +	U32 const tableMask = tableSize - 1;
@@ -8736,14 +8759,23 @@
 +	void *const FSCT = ((U32 *)ptr) + 1 /* header */ + (tableLog ? tableSize >> 1 : 1);
 +	FSE_symbolCompressionTransform *const symbolTT = (FSE_symbolCompressionTransform *)(FSCT);
 +	U32 const step = FSE_TABLESTEP(tableSize);
-+	U32 cumul[FSE_MAX_SYMBOL_VALUE + 2];
-+
-+	FSE_FUNCTION_TYPE *const tableSymbol = (FSE_FUNCTION_TYPE *)workSpace;
 +	U32 highThreshold = tableSize - 1;
 +
-+	/* CTable header */
-+	if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize)
++	U32 *cumul;
++	FSE_FUNCTION_TYPE *tableSymbol;
++	size_t spaceUsed32 = 0;
++
++	cumul = (U32 *)workspace + spaceUsed32;
++	spaceUsed32 += FSE_MAX_SYMBOL_VALUE + 2;
++	tableSymbol = (FSE_FUNCTION_TYPE *)((U32 *)workspace + spaceUsed32);
++	spaceUsed32 += ALIGN(sizeof(FSE_FUNCTION_TYPE) * ((size_t)1 << tableLog), sizeof(U32)) >> 2;
++
++	if ((spaceUsed32 << 2) > workspaceSize)
 +		return ERROR(tableLog_tooLarge);
++	workspace = (U32 *)workspace + spaceUsed32;
++	workspaceSize -= (spaceUsed32 << 2);
++
++	/* CTable header */
 +	tableU16[-2] = (U16)tableLog;
 +	tableU16[-1] = (U16)maxSymbolValue;
 +
@@ -9215,7 +9247,7 @@
 +	{
 +		U64 const vStepLog = 62 - tableLog;
 +		U64 const mid = (1ULL << (vStepLog - 1)) - 1;
-+		U64 const rStep = ((((U64)1 << vStepLog) * ToDistribute) + mid) / total; /* scale on remaining */
++		U64 const rStep = div_u64((((U64)1 << vStepLog) * ToDistribute) + mid, (U32)total); /* scale on remaining */
 +		U64 tmpTotal = mid;
 +		for (s = 0; s <= maxSymbolValue; s++) {
 +			if (norm[s] == NOT_YET_ASSIGNED) {
@@ -9249,7 +9281,7 @@
 +	{
 +		U32 const rtbTable[] = {0, 473195, 504333, 520860, 550000, 700000, 750000, 830000};
 +		U64 const scale = 62 - tableLog;
-+		U64 const step = ((U64)1 << 62) / total; /* <== here, one division ! */
++		U64 const step = div_u64((U64)1 << 62, (U32)total); /* <== here, one division ! */
 +		U64 const vStep = 1ULL << (scale - 20);
 +		int stillToDistribute = 1 << tableLog;
 +		unsigned s;
@@ -9422,85 +9454,12 @@
 +}
 +
 +size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
-+
-+#define CHECK_V_F(e, f)     \
-+	size_t const e = f; \
-+	if (ERR_isError(e)) \
-+	return f
-+#define CHECK_F(f)                        \
-+	{                                 \
-+		CHECK_V_F(_var_err__, f); \
-+	}
-+
-+/* FSE_compress_wksp() :
-+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
-+ * `wkspSize` size must be `(1<<tableLog)`.
-+ */
-+size_t FSE_compress_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
-+			 size_t wkspSize)
-+{
-+	BYTE *const ostart = (BYTE *)dst;
-+	BYTE *op = ostart;
-+	BYTE *const oend = ostart + dstSize;
-+
-+	U32 count[FSE_MAX_SYMBOL_VALUE + 1];
-+	S16 norm[FSE_MAX_SYMBOL_VALUE + 1];
-+	FSE_CTable *CTable = (FSE_CTable *)workSpace;
-+	size_t const CTableSize = FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue);
-+	void *scratchBuffer = (void *)(CTable + CTableSize);
-+	size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable));
-+
-+	/* init conditions */
-+	if (wkspSize < FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue))
-+		return ERROR(tableLog_tooLarge);
-+	if (srcSize <= 1)
-+		return 0; /* Not compressible */
-+	if (!maxSymbolValue)
-+		maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
-+	if (!tableLog)
-+		tableLog = FSE_DEFAULT_TABLELOG;
-+
-+	/* Scan input and build symbol stats */
-+	{
-+		CHECK_V_F(maxCount, FSE_count_wksp(count, &maxSymbolValue, src, srcSize, (unsigned *)scratchBuffer));
-+		if (maxCount == srcSize)
-+			return 1; /* only a single symbol in src : rle */
-+		if (maxCount == 1)
-+			return 0; /* each symbol present maximum once => not compressible */
-+		if (maxCount < (srcSize >> 7))
-+			return 0; /* Heuristic : not compressible enough */
-+	}
-+
-+	tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
-+	CHECK_F(FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue));
-+
-+	/* Write table description header */
-+	{
-+		CHECK_V_F(nc_err, FSE_writeNCount(op, oend - op, norm, maxSymbolValue, tableLog));
-+		op += nc_err;
-+	}
-+
-+	/* Compress */
-+	CHECK_F(FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, scratchBufferSize));
-+	{
-+		CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, src, srcSize, CTable));
-+		if (cSize == 0)
-+			return 0; /* not enough space for compressed data */
-+		op += cSize;
-+	}
-+
-+	/* check compressibility */
-+	if ((size_t)(op - ostart) >= srcSize - 1)
-+		return 0;
-+
-+	return op - ostart;
-+}
 diff --git a/lib/zstd/fse_decompress.c b/lib/zstd/fse_decompress.c
 new file mode 100644
-index 0000000..96cf89f
+index 0000000..a84300e
 --- /dev/null
 +++ b/lib/zstd/fse_decompress.c
-@@ -0,0 +1,313 @@
+@@ -0,0 +1,332 @@
 +/*
 + * FSE : Finite State Entropy decoder
 + * Copyright (C) 2013-2015, Yann Collet.
@@ -9551,6 +9510,7 @@
 +#include "bitstream.h"
 +#include "fse.h"
 +#include <linux/compiler.h>
++#include <linux/kernel.h>
 +#include <linux/string.h> /* memcpy, memset */
 +
 +/* **************************************************************
@@ -9594,17 +9554,19 @@
 +
 +/* Function templates */
 +
-+size_t FSE_buildDTable(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
++size_t FSE_buildDTable_wksp(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize)
 +{
 +	void *const tdPtr = dt + 1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
 +	FSE_DECODE_TYPE *const tableDecode = (FSE_DECODE_TYPE *)(tdPtr);
-+	U16 symbolNext[FSE_MAX_SYMBOL_VALUE + 1];
++	U16 *symbolNext = (U16 *)workspace;
 +
 +	U32 const maxSV1 = maxSymbolValue + 1;
 +	U32 const tableSize = 1 << tableLog;
 +	U32 highThreshold = tableSize - 1;
 +
 +	/* Sanity Checks */
++	if (workspaceSize < sizeof(U16) * (FSE_MAX_SYMBOL_VALUE + 1))
++		return ERROR(tableLog_tooLarge);
 +	if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE)
 +		return ERROR(maxSymbolValue_tooLarge);
 +	if (tableLog > FSE_MAX_TABLELOG)
@@ -9791,16 +9753,32 @@
 +	return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
 +}
 +
-+size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, FSE_DTable *workSpace, unsigned maxLog)
++size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, unsigned maxLog, void *workspace, size_t workspaceSize)
 +{
 +	const BYTE *const istart = (const BYTE *)cSrc;
 +	const BYTE *ip = istart;
-+	short counting[FSE_MAX_SYMBOL_VALUE + 1];
 +	unsigned tableLog;
 +	unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
++	size_t NCountLength;
++
++	FSE_DTable *dt;
++	short *counting;
++	size_t spaceUsed32 = 0;
++
++	FSE_STATIC_ASSERT(sizeof(FSE_DTable) == sizeof(U32));
++
++	dt = (FSE_DTable *)((U32 *)workspace + spaceUsed32);
++	spaceUsed32 += FSE_DTABLE_SIZE_U32(maxLog);
++	counting = (short *)((U32 *)workspace + spaceUsed32);
++	spaceUsed32 += ALIGN(sizeof(short) * (FSE_MAX_SYMBOL_VALUE + 1), sizeof(U32)) >> 2;
++
++	if ((spaceUsed32 << 2) > workspaceSize)
++		return ERROR(tableLog_tooLarge);
++	workspace = (U32 *)workspace + spaceUsed32;
++	workspaceSize -= (spaceUsed32 << 2);
 +
 +	/* normal FSE decoding mode */
-+	size_t const NCountLength = FSE_readNCount(counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
++	NCountLength = FSE_readNCount(counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
 +	if (FSE_isError(NCountLength))
 +		return NCountLength;
 +	// if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size; supposed to be already checked in NCountLength, only remaining
@@ -9810,16 +9788,16 @@
 +	ip += NCountLength;
 +	cSrcSize -= NCountLength;
 +
-+	CHECK_F(FSE_buildDTable(workSpace, counting, maxSymbolValue, tableLog));
++	CHECK_F(FSE_buildDTable_wksp(dt, counting, maxSymbolValue, tableLog, workspace, workspaceSize));
 +
-+	return FSE_decompress_usingDTable(dst, dstCapacity, ip, cSrcSize, workSpace); /* always return, even if it is an error code */
++	return FSE_decompress_usingDTable(dst, dstCapacity, ip, cSrcSize, dt); /* always return, even if it is an error code */
 +}
 diff --git a/lib/zstd/huf.h b/lib/zstd/huf.h
 new file mode 100644
-index 0000000..56abe2f
+index 0000000..2143da2
 --- /dev/null
 +++ b/lib/zstd/huf.h
-@@ -0,0 +1,203 @@
+@@ -0,0 +1,212 @@
 +/*
 + * Huffman coder, part of New Generation Entropy library
 + * header file
@@ -9877,7 +9855,7 @@
 +/** HUF_compress4X_wksp() :
 +*   Same as HUF_compress2(), but uses externally allocated `workSpace`, which must be a table of >= 1024 unsigned */
 +size_t HUF_compress4X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
-+			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
++			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
 +
 +/* *** Dependencies *** */
 +#include "mem.h" /* U32 */
@@ -9913,17 +9891,23 @@
 +#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = {((U32)(maxTableLog)*0x01000001)}
 +
 +/* The workspace must have alignment at least 4 and be at least this large */
-+#define HUF_WORKSPACE_SIZE (6 << 10)
-+#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
++#define HUF_COMPRESS_WORKSPACE_SIZE (6 << 10)
++#define HUF_COMPRESS_WORKSPACE_SIZE_U32 (HUF_COMPRESS_WORKSPACE_SIZE / sizeof(U32))
++
++/* The workspace must have alignment at least 4 and be at least this large */
++#define HUF_DECOMPRESS_WORKSPACE_SIZE (3 << 10)
++#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
 +
 +/* ****************************************
 +*  Advanced decompression functions
 +******************************************/
-+size_t HUF_decompress4X_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize); /**< decodes RLE and uncompressed */
-+size_t HUF_decompress4X_hufOnly(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc,
-+				size_t cSrcSize);							       /**< considers RLE and uncompressed as errors */
-+size_t HUF_decompress4X2_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize); /**< single-symbol decoder */
-+size_t HUF_decompress4X4_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize); /**< double-symbols decoder */
++size_t HUF_decompress4X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize); /**< decodes RLE and uncompressed */
++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
++				size_t workspaceSize);							       /**< considers RLE and uncompressed as errors */
++size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
++				   size_t workspaceSize); /**< single-symbol decoder */
++size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
++				   size_t workspaceSize); /**< double-symbols decoder */
 +
 +/* ****************************************
 +*  HUF detailed API
@@ -9933,7 +9917,7 @@
 +1. count symbol occurrence from source[] into table count[] using FSE_count()
 +2. (optional) refine tableLog using HUF_optimalTableLog()
 +3. build Huffman table from count using HUF_buildCTable()
-+4. save Huffman table to memory buffer using HUF_writeCTable()
++4. save Huffman table to memory buffer using HUF_writeCTable_wksp()
 +5. encode the data stream using HUF_compress4X_usingCTable()
 +
 +The following API allows targeting specific sub-functions for advanced tasks.
@@ -9943,7 +9927,7 @@
 +/* FSE_count() : find it within "fse.h" */
 +unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
 +typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */
-+size_t HUF_writeCTable(void *dst, size_t maxDstSize, const HUF_CElt *CTable, unsigned maxSymbolValue, unsigned huffLog);
++size_t HUF_writeCTable_wksp(void *dst, size_t maxDstSize, const HUF_CElt *CTable, unsigned maxSymbolValue, unsigned huffLog, void *workspace, size_t workspaceSize);
 +size_t HUF_compress4X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable);
 +
 +typedef enum {
@@ -9959,7 +9943,7 @@
 +*   If preferRepeat then the old table will always be used if valid. */
 +size_t HUF_compress4X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
 +			     size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat,
-+			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
++			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
 +
 +/** HUF_buildCTable_wksp() :
 + *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
@@ -9972,11 +9956,12 @@
 +	`huffWeight` is destination buffer.
 +	@return : size read from `src` , or an error Code .
 +	Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
-+size_t HUF_readStats(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize);
++size_t HUF_readStats_wksp(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize,
++			  void *workspace, size_t workspaceSize);
 +
 +/** HUF_readCTable() :
 +*   Loading a CTable saved with HUF_writeCTable() */
-+size_t HUF_readCTable(HUF_CElt *CTable, unsigned maxSymbolValue, const void *src, size_t srcSize);
++size_t HUF_readCTable_wksp(HUF_CElt *CTable, unsigned maxSymbolValue, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
 +
 +/*
 +HUF_decompress() does the following:
@@ -9992,8 +9977,8 @@
 +*   Assumption : 0 < cSrcSize < dstSize <= 128 KB */
 +U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize);
 +
-+size_t HUF_readDTableX2(HUF_DTable *DTable, const void *src, size_t srcSize);
-+size_t HUF_readDTableX4(HUF_DTable *DTable, const void *src, size_t srcSize);
++size_t HUF_readDTableX2_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
++size_t HUF_readDTableX4_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
 +
 +size_t HUF_decompress4X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
 +size_t HUF_decompress4X2_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
@@ -10002,7 +9987,7 @@
 +/* single stream variants */
 +
 +size_t HUF_compress1X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
-+			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
++			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
 +size_t HUF_compress1X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable);
 +/** HUF_compress1X_repeat() :
 +*   Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
@@ -10011,11 +9996,13 @@
 +*   If preferRepeat then the old table will always be used if valid. */
 +size_t HUF_compress1X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
 +			     size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat,
-+			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
++			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
 +
-+size_t HUF_decompress1X_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize);
-+size_t HUF_decompress1X2_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize); /**< single-symbol decoder */
-+size_t HUF_decompress1X4_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize); /**< double-symbols decoder */
++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize);
++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
++				   size_t workspaceSize); /**< single-symbol decoder */
++size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
++				   size_t workspaceSize); /**< double-symbols decoder */
 +
 +size_t HUF_decompress1X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize,
 +				    const HUF_DTable *DTable); /**< automatic selection of sing or double symbol decoder, based on DTable */
@@ -10025,10 +10012,10 @@
 +#endif /* HUF_H_298734234 */
 diff --git a/lib/zstd/huf_compress.c b/lib/zstd/huf_compress.c
 new file mode 100644
-index 0000000..e82a136
+index 0000000..0361f38
 --- /dev/null
 +++ b/lib/zstd/huf_compress.c
-@@ -0,0 +1,731 @@
+@@ -0,0 +1,771 @@
 +/*
 + * Huffman encoder, part of New Generation Entropy library
 + * Copyright (C) 2013-2016, Yann Collet.
@@ -10074,6 +10061,7 @@
 +#include "bitstream.h"
 +#include "fse.h" /* header compression */
 +#include "huf.h"
++#include <linux/kernel.h>
 +#include <linux/string.h> /* memcpy, memset */
 +
 +/* **************************************************************
@@ -10109,7 +10097,7 @@
 + * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
 + */
 +#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
-+size_t HUF_compressWeights(void *dst, size_t dstSize, const void *weightTable, size_t wtSize)
++size_t HUF_compressWeights_wksp(void *dst, size_t dstSize, const void *weightTable, size_t wtSize, void *workspace, size_t workspaceSize)
 +{
 +	BYTE *const ostart = (BYTE *)dst;
 +	BYTE *op = ostart;
@@ -10118,11 +10106,24 @@
 +	U32 maxSymbolValue = HUF_TABLELOG_MAX;
 +	U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
 +
-+	FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
-+	BYTE scratchBuffer[1 << MAX_FSE_TABLELOG_FOR_HUFF_HEADER];
++	FSE_CTable *CTable;
++	U32 *count;
++	S16 *norm;
++	size_t spaceUsed32 = 0;
 +
-+	U32 count[HUF_TABLELOG_MAX + 1];
-+	S16 norm[HUF_TABLELOG_MAX + 1];
++	HUF_STATIC_ASSERT(sizeof(FSE_CTable) == sizeof(U32));
++
++	CTable = (FSE_CTable *)((U32 *)workspace + spaceUsed32);
++	spaceUsed32 += FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX);
++	count = (U32 *)workspace + spaceUsed32;
++	spaceUsed32 += HUF_TABLELOG_MAX + 1;
++	norm = (S16 *)((U32 *)workspace + spaceUsed32);
++	spaceUsed32 += ALIGN(sizeof(S16) * (HUF_TABLELOG_MAX + 1), sizeof(U32)) >> 2;
++
++	if ((spaceUsed32 << 2) > workspaceSize)
++		return ERROR(tableLog_tooLarge);
++	workspace = (U32 *)workspace + spaceUsed32;
++	workspaceSize -= (spaceUsed32 << 2);
 +
 +	/* init conditions */
 +	if (wtSize <= 1)
@@ -10147,7 +10148,7 @@
 +	}
 +
 +	/* Compress */
-+	CHECK_F(FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)));
++	CHECK_F(FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, workspace, workspaceSize));
 +	{
 +		CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, weightTable, wtSize, CTable));
 +		if (cSize == 0)
@@ -10163,16 +10164,28 @@
 +	BYTE nbBits;
 +}; /* typedef'd to HUF_CElt within "huf.h" */
 +
-+/*! HUF_writeCTable() :
++/*! HUF_writeCTable_wksp() :
 +	`CTable` : Huffman tree to save, using huf representation.
 +	@return : size of saved CTable */
-+size_t HUF_writeCTable(void *dst, size_t maxDstSize, const HUF_CElt *CTable, U32 maxSymbolValue, U32 huffLog)
++size_t HUF_writeCTable_wksp(void *dst, size_t maxDstSize, const HUF_CElt *CTable, U32 maxSymbolValue, U32 huffLog, void *workspace, size_t workspaceSize)
 +{
-+	BYTE bitsToWeight[HUF_TABLELOG_MAX + 1]; /* precomputed conversion table */
-+	BYTE huffWeight[HUF_SYMBOLVALUE_MAX];
 +	BYTE *op = (BYTE *)dst;
 +	U32 n;
 +
++	BYTE *bitsToWeight;
++	BYTE *huffWeight;
++	size_t spaceUsed32 = 0;
++
++	bitsToWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
++	spaceUsed32 += ALIGN(HUF_TABLELOG_MAX + 1, sizeof(U32)) >> 2;
++	huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
++	spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX, sizeof(U32)) >> 2;
++
++	if ((spaceUsed32 << 2) > workspaceSize)
++		return ERROR(tableLog_tooLarge);
++	workspace = (U32 *)workspace + spaceUsed32;
++	workspaceSize -= (spaceUsed32 << 2);
++
 +	/* check conditions */
 +	if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
 +		return ERROR(maxSymbolValue_tooLarge);
@@ -10186,7 +10199,7 @@
 +
 +	/* attempt weights compression by FSE */
 +	{
-+		CHECK_V_F(hSize, HUF_compressWeights(op + 1, maxDstSize - 1, huffWeight, maxSymbolValue));
++		CHECK_V_F(hSize, HUF_compressWeights_wksp(op + 1, maxDstSize - 1, huffWeight, maxSymbolValue, workspace, workspaceSize));
 +		if ((hSize > 1) & (hSize < maxSymbolValue / 2)) { /* FSE compressed */
 +			op[0] = (BYTE)hSize;
 +			return hSize + 1;
@@ -10205,15 +10218,29 @@
 +	return ((maxSymbolValue + 1) / 2) + 1;
 +}
 +
-+size_t HUF_readCTable(HUF_CElt *CTable, U32 maxSymbolValue, const void *src, size_t srcSize)
++size_t HUF_readCTable_wksp(HUF_CElt *CTable, U32 maxSymbolValue, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
 +{
-+	BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];  /* init not required, even though some static analyzer may complain */
-+	U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */
++	U32 *rankVal;
++	BYTE *huffWeight;
 +	U32 tableLog = 0;
 +	U32 nbSymbols = 0;
++	size_t readSize;
++	size_t spaceUsed32 = 0;
++
++	rankVal = (U32 *)workspace + spaceUsed32;
++	spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
++	huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
++	spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
++
++	if ((spaceUsed32 << 2) > workspaceSize)
++		return ERROR(tableLog_tooLarge);
++	workspace = (U32 *)workspace + spaceUsed32;
++	workspaceSize -= (spaceUsed32 << 2);
 +
 +	/* get symbol weights */
-+	CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize));
++	readSize = HUF_readStats_wksp(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize);
++	if (ERR_isError(readSize))
++		return readSize;
 +
 +	/* check result */
 +	if (tableLog > HUF_TABLELOG_MAX)
@@ -10711,7 +10738,7 @@
 +
 +	/* Write table description header */
 +	{
-+		CHECK_V_F(hSize, HUF_writeCTable(op, dstSize, CTable, maxSymbolValue, huffLog));
++		CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, CTable, maxSymbolValue, huffLog, workSpace, wkspSize));
 +		/* Check if using the previous table will be beneficial */
 +		if (repeat && *repeat != HUF_repeat_none) {
 +			size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, count, maxSymbolValue);
@@ -10762,10 +10789,10 @@
 +}
 diff --git a/lib/zstd/huf_decompress.c b/lib/zstd/huf_decompress.c
 new file mode 100644
-index 0000000..950c194
+index 0000000..6526482
 --- /dev/null
 +++ b/lib/zstd/huf_decompress.c
-@@ -0,0 +1,920 @@
+@@ -0,0 +1,960 @@
 +/*
 + * Huffman decoder, part of New Generation Entropy library
 + * Copyright (C) 2013-2016, Yann Collet.
@@ -10817,6 +10844,7 @@
 +#include "fse.h"       /* header compression */
 +#include "huf.h"
 +#include <linux/compiler.h>
++#include <linux/kernel.h>
 +#include <linux/string.h> /* memcpy, memset */
 +
 +/* **************************************************************
@@ -10854,20 +10882,32 @@
 +	BYTE nbBits;
 +} HUF_DEltX2; /* single-symbol decoding */
 +
-+size_t HUF_readDTableX2(HUF_DTable *DTable, const void *src, size_t srcSize)
++size_t HUF_readDTableX2_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
 +{
-+	BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
-+	U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */
 +	U32 tableLog = 0;
 +	U32 nbSymbols = 0;
 +	size_t iSize;
 +	void *const dtPtr = DTable + 1;
 +	HUF_DEltX2 *const dt = (HUF_DEltX2 *)dtPtr;
 +
++	U32 *rankVal;
++	BYTE *huffWeight;
++	size_t spaceUsed32 = 0;
++
++	rankVal = (U32 *)workspace + spaceUsed32;
++	spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
++	huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
++	spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
++
++	if ((spaceUsed32 << 2) > workspaceSize)
++		return ERROR(tableLog_tooLarge);
++	workspace = (U32 *)workspace + spaceUsed32;
++	workspaceSize -= (spaceUsed32 << 2);
++
 +	HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
 +	/* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
 +
-+	iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
++	iSize = HUF_readStats_wksp(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize);
 +	if (HUF_isError(iSize))
 +		return iSize;
 +
@@ -10984,11 +11024,11 @@
 +	return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
 +}
 +
-+size_t HUF_decompress1X2_DCtx(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 +{
 +	const BYTE *ip = (const BYTE *)cSrc;
 +
-+	size_t const hSize = HUF_readDTableX2(DCtx, cSrc, cSrcSize);
++	size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workspace, workspaceSize);
 +	if (HUF_isError(hSize))
 +		return hSize;
 +	if (hSize >= cSrcSize)
@@ -11115,11 +11155,11 @@
 +	return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
 +}
 +
-+size_t HUF_decompress4X2_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
++size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 +{
 +	const BYTE *ip = (const BYTE *)cSrc;
 +
-+	size_t const hSize = HUF_readDTableX2(dctx, cSrc, cSrcSize);
++	size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workspace, workspaceSize);
 +	if (HUF_isError(hSize))
 +		return hSize;
 +	if (hSize >= cSrcSize)
@@ -11190,6 +11230,7 @@
 +}
 +
 +typedef U32 rankVal_t[HUF_TABLELOG_MAX][HUF_TABLELOG_MAX + 1];
++typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
 +
 +static void HUF_fillDTableX4(HUF_DEltX4 *DTable, const U32 targetLog, const sortedSymbol_t *sortedList, const U32 sortedListSize, const U32 *rankStart,
 +			     rankVal_t rankValOrigin, const U32 maxWeight, const U32 nbBitsBaseline)
@@ -11233,27 +11274,50 @@
 +	}
 +}
 +
-+size_t HUF_readDTableX4(HUF_DTable *DTable, const void *src, size_t srcSize)
++size_t HUF_readDTableX4_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
 +{
-+	BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
-+	sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
-+	U32 rankStats[HUF_TABLELOG_MAX + 1] = {0};
-+	U32 rankStart0[HUF_TABLELOG_MAX + 2] = {0};
-+	U32 *const rankStart = rankStart0 + 1;
-+	rankVal_t rankVal;
 +	U32 tableLog, maxW, sizeOfSort, nbSymbols;
 +	DTableDesc dtd = HUF_getDTableDesc(DTable);
 +	U32 const maxTableLog = dtd.maxTableLog;
 +	size_t iSize;
 +	void *dtPtr = DTable + 1; /* force compiler to avoid strict-aliasing */
 +	HUF_DEltX4 *const dt = (HUF_DEltX4 *)dtPtr;
++	U32 *rankStart;
++
++	rankValCol_t *rankVal;
++	U32 *rankStats;
++	U32 *rankStart0;
++	sortedSymbol_t *sortedSymbol;
++	BYTE *weightList;
++	size_t spaceUsed32 = 0;
++
++	HUF_STATIC_ASSERT((sizeof(rankValCol_t) & 3) == 0);
++
++	rankVal = (rankValCol_t *)((U32 *)workspace + spaceUsed32);
++	spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
++	rankStats = (U32 *)workspace + spaceUsed32;
++	spaceUsed32 += HUF_TABLELOG_MAX + 1;
++	rankStart0 = (U32 *)workspace + spaceUsed32;
++	spaceUsed32 += HUF_TABLELOG_MAX + 2;
++	sortedSymbol = (sortedSymbol_t *)((U32 *)workspace + spaceUsed32);
++	spaceUsed32 += ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
++	weightList = (BYTE *)((U32 *)workspace + spaceUsed32);
++	spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
++
++	if ((spaceUsed32 << 2) > workspaceSize)
++		return ERROR(tableLog_tooLarge);
++	workspace = (U32 *)workspace + spaceUsed32;
++	workspaceSize -= (spaceUsed32 << 2);
++
++	rankStart = rankStart0 + 1;
++	memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
 +
 +	HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */
 +	if (maxTableLog > HUF_TABLELOG_MAX)
 +		return ERROR(tableLog_tooLarge);
 +	/* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
 +
-+	iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
++	iSize = HUF_readStats_wksp(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize);
 +	if (HUF_isError(iSize))
 +		return iSize;
 +
@@ -11420,11 +11484,11 @@
 +	return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
 +}
 +
-+size_t HUF_decompress1X4_DCtx(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
++size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 +{
 +	const BYTE *ip = (const BYTE *)cSrc;
 +
-+	size_t const hSize = HUF_readDTableX4(DCtx, cSrc, cSrcSize);
++	size_t const hSize = HUF_readDTableX4_wksp(DCtx, cSrc, cSrcSize, workspace, workspaceSize);
 +	if (HUF_isError(hSize))
 +		return hSize;
 +	if (hSize >= cSrcSize)
@@ -11553,11 +11617,11 @@
 +	return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
 +}
 +
-+size_t HUF_decompress4X4_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
++size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 +{
 +	const BYTE *ip = (const BYTE *)cSrc;
 +
-+	size_t hSize = HUF_readDTableX4(dctx, cSrc, cSrcSize);
++	size_t hSize = HUF_readDTableX4_wksp(dctx, cSrc, cSrcSize, workspace, workspaceSize);
 +	if (HUF_isError(hSize))
 +		return hSize;
 +	if (hSize >= cSrcSize)
@@ -11629,7 +11693,7 @@
 +
 +typedef size_t (*decompressionAlgo)(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize);
 +
-+size_t HUF_decompress4X_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
++size_t HUF_decompress4X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 +{
 +	/* validation checks */
 +	if (dstSize == 0)
@@ -11647,11 +11711,12 @@
 +
 +	{
 +		U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-+		return algoNb ? HUF_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
++		return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize)
++			      : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize);
 +	}
 +}
 +
-+size_t HUF_decompress4X_hufOnly(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 +{
 +	/* validation checks */
 +	if (dstSize == 0)
@@ -11661,11 +11726,12 @@
 +
 +	{
 +		U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-+		return algoNb ? HUF_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
++		return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize)
++			      : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize);
 +	}
 +}
 +
-+size_t HUF_decompress1X_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 +{
 +	/* validation checks */
 +	if (dstSize == 0)
@@ -11683,7 +11749,8 @@
 +
 +	{
 +		U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-+		return algoNb ? HUF_decompress1X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : HUF_decompress1X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
++		return algoNb ? HUF_decompress1X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize)
++			      : HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize);
 +	}
 +}
 diff --git a/lib/zstd/mem.h b/lib/zstd/mem.h
@@ -11845,7 +11912,7 @@
 +#endif /* MEM_H_MODULE */
 diff --git a/lib/zstd/zstd_common.c b/lib/zstd/zstd_common.c
 new file mode 100644
-index 0000000..6ebf68d
+index 0000000..a282624
 --- /dev/null
 +++ b/lib/zstd/zstd_common.c
 @@ -0,0 +1,75 @@
@@ -11902,7 +11969,7 @@
 +void *ZSTD_stackAllocAll(void *opaque, size_t *size)
 +{
 +	ZSTD_stack *stack = (ZSTD_stack *)opaque;
-+	*size = stack->end - ZSTD_PTR_ALIGN(stack->ptr);
++	*size = (BYTE const *)stack->end - (BYTE *)ZSTD_PTR_ALIGN(stack->ptr);
 +	return stack_push(stack, *size);
 +}
 +
diff --git a/contrib/linux-kernel/0003-btrfs-Add-zstd-support.patch b/contrib/linux-kernel/0003-btrfs-Add-zstd-support.patch
index 53d03d3..abc8326 100644
--- a/contrib/linux-kernel/0003-btrfs-Add-zstd-support.patch
+++ b/contrib/linux-kernel/0003-btrfs-Add-zstd-support.patch
@@ -1,7 +1,7 @@
 From 599f8f2aaace3df939cb145368574a52268d82d0 Mon Sep 17 00:00:00 2001
 From: Nick Terrell <terrelln@fb.com>
 Date: Wed, 21 Jun 2017 17:31:39 -0700
-Subject: [PATCH 3/4] btrfs: Add zstd support
+Subject: [PATCH v2 3/4] btrfs: Add zstd support
 
 Add zstd compression and decompression support to BtrFS. zstd at its
 fastest level compresses almost as well as zlib, while offering much
diff --git a/contrib/linux-kernel/0004-squashfs-Add-zstd-support.patch b/contrib/linux-kernel/0004-squashfs-Add-zstd-support.patch
index e9c4b98..b638194 100644
--- a/contrib/linux-kernel/0004-squashfs-Add-zstd-support.patch
+++ b/contrib/linux-kernel/0004-squashfs-Add-zstd-support.patch
@@ -1,7 +1,7 @@
 From 5ff6a64abaea7b7f11d37cb0fdf08642316a3a90 Mon Sep 17 00:00:00 2001
 From: Nick Terrell <terrelln@fb.com>
 Date: Mon, 12 Jun 2017 12:18:23 -0700
-Subject: [PATCH 4/4] squashfs: Add zstd support
+Subject: [PATCH v2 4/4] squashfs: Add zstd support
 
 Add zstd compression and decompression support to SquashFS. zstd is a
 great fit for SquashFS because it can compress at ratios approaching xz,
diff --git a/contrib/linux-kernel/lib/xxhash.c b/contrib/linux-kernel/lib/xxhash.c
index dc94904..aa61e2a 100644
--- a/contrib/linux-kernel/lib/xxhash.c
+++ b/contrib/linux-kernel/lib/xxhash.c
@@ -391,7 +391,7 @@
 	}
 
 	if (state->memsize) { /* tmp buffer is full */
-		const uint64_t *p64 = state->mem64;
+		uint64_t *p64 = state->mem64;
 
 		memcpy(((uint8_t *)p64) + state->memsize, input,
 			32 - state->memsize);
diff --git a/contrib/linux-kernel/lib/zstd/compress.c b/contrib/linux-kernel/lib/zstd/compress.c
index 1aff542..d60ab7d 100644
--- a/contrib/linux-kernel/lib/zstd/compress.c
+++ b/contrib/linux-kernel/lib/zstd/compress.c
@@ -84,7 +84,7 @@
 	FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
 	FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
 	FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
-	unsigned tmpCounters[HUF_WORKSPACE_SIZE_U32];
+	unsigned tmpCounters[HUF_COMPRESS_WORKSPACE_SIZE_U32];
 };
 
 size_t ZSTD_CCtxWorkspaceBound(ZSTD_compressionParameters cParams)
@@ -587,8 +587,6 @@
 {
 	const int longOffsets = zc->params.cParams.windowLog > STREAM_ACCUMULATOR_MIN;
 	const seqStore_t *seqStorePtr = &(zc->seqStore);
-	U32 count[MaxSeq + 1];
-	S16 norm[MaxSeq + 1];
 	FSE_CTable *CTable_LitLength = zc->litlengthCTable;
 	FSE_CTable *CTable_OffsetBits = zc->offcodeCTable;
 	FSE_CTable *CTable_MatchLength = zc->matchlengthCTable;
@@ -602,7 +600,21 @@
 	BYTE *op = ostart;
 	size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
 	BYTE *seqHead;
-	BYTE scratchBuffer[1 << MAX(MLFSELog, LLFSELog)];
+
+	U32 *count;
+	S16 *norm;
+	U32 *workspace;
+	size_t workspaceSize = sizeof(zc->tmpCounters);
+	{
+		size_t spaceUsed32 = 0;
+		count = (U32 *)zc->tmpCounters + spaceUsed32;
+		spaceUsed32 += MaxSeq + 1;
+		norm = (S16 *)((U32 *)zc->tmpCounters + spaceUsed32);
+		spaceUsed32 += ALIGN(sizeof(S16) * (MaxSeq + 1), sizeof(U32)) >> 2;
+
+		workspace = (U32 *)zc->tmpCounters + spaceUsed32;
+		workspaceSize -= (spaceUsed32 << 2);
+	}
 
 	/* Compress literals */
 	{
@@ -638,7 +650,7 @@
 	/* CTable for Literal Lengths */
 	{
 		U32 max = MaxLL;
-		size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, zc->tmpCounters);
+		size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, workspace);
 		if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
 			*op++ = llCodeTable[0];
 			FSE_buildCTable_rle(CTable_LitLength, (BYTE)max);
@@ -646,7 +658,7 @@
 		} else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
 			LLtype = set_repeat;
 		} else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (LL_defaultNormLog - 1)))) {
-			FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
+			FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, workspace, workspaceSize);
 			LLtype = set_basic;
 		} else {
 			size_t nbSeq_1 = nbSeq;
@@ -662,7 +674,7 @@
 					return NCountSize;
 				op += NCountSize;
 			}
-			FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
+			FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, workspace, workspaceSize);
 			LLtype = set_compressed;
 		}
 	}
@@ -670,7 +682,7 @@
 	/* CTable for Offsets */
 	{
 		U32 max = MaxOff;
-		size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, zc->tmpCounters);
+		size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, workspace);
 		if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
 			*op++ = ofCodeTable[0];
 			FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
@@ -678,7 +690,7 @@
 		} else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
 			Offtype = set_repeat;
 		} else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (OF_defaultNormLog - 1)))) {
-			FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, MaxOff, OF_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
+			FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, MaxOff, OF_defaultNormLog, workspace, workspaceSize);
 			Offtype = set_basic;
 		} else {
 			size_t nbSeq_1 = nbSeq;
@@ -694,7 +706,7 @@
 					return NCountSize;
 				op += NCountSize;
 			}
-			FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
+			FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, workspace, workspaceSize);
 			Offtype = set_compressed;
 		}
 	}
@@ -702,7 +714,7 @@
 	/* CTable for MatchLengths */
 	{
 		U32 max = MaxML;
-		size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, zc->tmpCounters);
+		size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, workspace);
 		if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
 			*op++ = *mlCodeTable;
 			FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max);
@@ -710,7 +722,7 @@
 		} else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
 			MLtype = set_repeat;
 		} else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (ML_defaultNormLog - 1)))) {
-			FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
+			FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, workspace, workspaceSize);
 			MLtype = set_basic;
 		} else {
 			size_t nbSeq_1 = nbSeq;
@@ -726,7 +738,7 @@
 					return NCountSize;
 				op += NCountSize;
 			}
-			FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
+			FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, workspace, workspaceSize);
 			MLtype = set_compressed;
 		}
 	}
@@ -2612,14 +2624,13 @@
 	const BYTE *const dictEnd = dictPtr + dictSize;
 	short offcodeNCount[MaxOff + 1];
 	unsigned offcodeMaxValue = MaxOff;
-	BYTE scratchBuffer[1 << MAX(MLFSELog, LLFSELog)];
 
 	dictPtr += 4; /* skip magic number */
 	cctx->dictID = cctx->params.fParams.noDictIDFlag ? 0 : ZSTD_readLE32(dictPtr);
 	dictPtr += 4;
 
 	{
-		size_t const hufHeaderSize = HUF_readCTable(cctx->hufTable, 255, dictPtr, dictEnd - dictPtr);
+		size_t const hufHeaderSize = HUF_readCTable_wksp(cctx->hufTable, 255, dictPtr, dictEnd - dictPtr, cctx->tmpCounters, sizeof(cctx->tmpCounters));
 		if (HUF_isError(hufHeaderSize))
 			return ERROR(dictionary_corrupted);
 		dictPtr += hufHeaderSize;
@@ -2633,7 +2644,7 @@
 		if (offcodeLog > OffFSELog)
 			return ERROR(dictionary_corrupted);
 		/* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
-		CHECK_E(FSE_buildCTable_wksp(cctx->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, scratchBuffer, sizeof(scratchBuffer)),
+		CHECK_E(FSE_buildCTable_wksp(cctx->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)),
 			dictionary_corrupted);
 		dictPtr += offcodeHeaderSize;
 	}
@@ -2649,7 +2660,7 @@
 		/* Every match length code must have non-zero probability */
 		CHECK_F(ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML));
 		CHECK_E(
-		    FSE_buildCTable_wksp(cctx->matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, scratchBuffer, sizeof(scratchBuffer)),
+		    FSE_buildCTable_wksp(cctx->matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)),
 		    dictionary_corrupted);
 		dictPtr += matchlengthHeaderSize;
 	}
@@ -2664,7 +2675,7 @@
 			return ERROR(dictionary_corrupted);
 		/* Every literal length code must have non-zero probability */
 		CHECK_F(ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL));
-		CHECK_E(FSE_buildCTable_wksp(cctx->litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, scratchBuffer, sizeof(scratchBuffer)),
+		CHECK_E(FSE_buildCTable_wksp(cctx->litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)),
 			dictionary_corrupted);
 		dictPtr += litlengthHeaderSize;
 	}
diff --git a/contrib/linux-kernel/lib/zstd/decompress.c b/contrib/linux-kernel/lib/zstd/decompress.c
index ec673d7..62449ae 100644
--- a/contrib/linux-kernel/lib/zstd/decompress.c
+++ b/contrib/linux-kernel/lib/zstd/decompress.c
@@ -70,6 +70,7 @@
 	FSE_DTable OFTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
 	FSE_DTable MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
 	HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
+	U64 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32 / 2];
 	U32 rep[ZSTD_REP_NUM];
 } ZSTD_entropyTables_t;
 
@@ -483,8 +484,10 @@
 					    ? (singleStream ? HUF_decompress1X_usingDTable(dctx->litBuffer, litSize, istart + lhSize, litCSize, dctx->HUFptr)
 							    : HUF_decompress4X_usingDTable(dctx->litBuffer, litSize, istart + lhSize, litCSize, dctx->HUFptr))
 					    : (singleStream
-						   ? HUF_decompress1X2_DCtx(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize)
-						   : HUF_decompress4X_hufOnly(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize))))
+						   ? HUF_decompress1X2_DCtx_wksp(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize,
+										 dctx->entropy.workspace, sizeof(dctx->entropy.workspace))
+						   : HUF_decompress4X_hufOnly_wksp(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize,
+										   dctx->entropy.workspace, sizeof(dctx->entropy.workspace)))))
 					return ERROR(corruption_detected);
 
 				dctx->litPtr = dctx->litBuffer;
@@ -747,7 +750,7 @@
 			  or an error code if it fails, testable with ZSTD_isError()
 */
 static size_t ZSTD_buildSeqTable(FSE_DTable *DTableSpace, const FSE_DTable **DTablePtr, symbolEncodingType_e type, U32 max, U32 maxLog, const void *src,
-				 size_t srcSize, const FSE_decode_t4 *defaultTable, U32 flagRepeatTable)
+				 size_t srcSize, const FSE_decode_t4 *defaultTable, U32 flagRepeatTable, void *workspace, size_t workspaceSize)
 {
 	const void *const tmpPtr = defaultTable; /* bypass strict aliasing */
 	switch (type) {
@@ -767,15 +770,23 @@
 	default: /* impossible */
 	case set_compressed: {
 		U32 tableLog;
-		S16 norm[MaxSeq + 1];
-		size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
-		if (FSE_isError(headerSize))
-			return ERROR(corruption_detected);
-		if (tableLog > maxLog)
-			return ERROR(corruption_detected);
-		FSE_buildDTable(DTableSpace, norm, max, tableLog);
-		*DTablePtr = DTableSpace;
-		return headerSize;
+		S16 *norm = (S16 *)workspace;
+		size_t const spaceUsed32 = ALIGN(sizeof(S16) * (MaxSeq + 1), sizeof(U32)) >> 2;
+
+		if ((spaceUsed32 << 2) > workspaceSize)
+			return ERROR(GENERIC);
+		workspace = (U32 *)workspace + spaceUsed32;
+		workspaceSize -= (spaceUsed32 << 2);
+		{
+			size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
+			if (FSE_isError(headerSize))
+				return ERROR(corruption_detected);
+			if (tableLog > maxLog)
+				return ERROR(corruption_detected);
+			FSE_buildDTable_wksp(DTableSpace, norm, max, tableLog, workspace, workspaceSize);
+			*DTablePtr = DTableSpace;
+			return headerSize;
+		}
 	}
 	}
 }
@@ -823,21 +834,21 @@
 		/* Build DTables */
 		{
 			size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr, LLtype, MaxLL, LLFSELog, ip, iend - ip,
-								  LL_defaultDTable, dctx->fseEntropy);
+								  LL_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace));
 			if (ZSTD_isError(llhSize))
 				return ERROR(corruption_detected);
 			ip += llhSize;
 		}
 		{
 			size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr, OFtype, MaxOff, OffFSELog, ip, iend - ip,
-								  OF_defaultDTable, dctx->fseEntropy);
+								  OF_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace));
 			if (ZSTD_isError(ofhSize))
 				return ERROR(corruption_detected);
 			ip += ofhSize;
 		}
 		{
 			size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr, MLtype, MaxML, MLFSELog, ip, iend - ip,
-								  ML_defaultDTable, dctx->fseEntropy);
+								  ML_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace));
 			if (ZSTD_isError(mlhSize))
 				return ERROR(corruption_detected);
 			ip += mlhSize;
@@ -1360,10 +1371,11 @@
 #define STORED_SEQS 4
 #define STOSEQ_MASK (STORED_SEQS - 1)
 #define ADVANCED_SEQS 4
-		seq_t sequences[STORED_SEQS];
+		seq_t *sequences = (seq_t *)dctx->entropy.workspace;
 		int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
 		seqState_t seqState;
 		int seqNb;
+		ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.workspace) >= sizeof(seq_t) * STORED_SEQS);
 		dctx->fseEntropy = 1;
 		{
 			U32 i;
@@ -1866,7 +1878,7 @@
 	dictPtr += 8; /* skip header = magic + dictID */
 
 	{
-		size_t const hSize = HUF_readDTableX4(entropy->hufTable, dictPtr, dictEnd - dictPtr);
+		size_t const hSize = HUF_readDTableX4_wksp(entropy->hufTable, dictPtr, dictEnd - dictPtr, entropy->workspace, sizeof(entropy->workspace));
 		if (HUF_isError(hSize))
 			return ERROR(dictionary_corrupted);
 		dictPtr += hSize;
@@ -1880,7 +1892,7 @@
 			return ERROR(dictionary_corrupted);
 		if (offcodeLog > OffFSELog)
 			return ERROR(dictionary_corrupted);
-		CHECK_E(FSE_buildDTable(entropy->OFTable, offcodeNCount, offcodeMaxValue, offcodeLog), dictionary_corrupted);
+		CHECK_E(FSE_buildDTable_wksp(entropy->OFTable, offcodeNCount, offcodeMaxValue, offcodeLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted);
 		dictPtr += offcodeHeaderSize;
 	}
 
@@ -1892,7 +1904,7 @@
 			return ERROR(dictionary_corrupted);
 		if (matchlengthLog > MLFSELog)
 			return ERROR(dictionary_corrupted);
-		CHECK_E(FSE_buildDTable(entropy->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog), dictionary_corrupted);
+		CHECK_E(FSE_buildDTable_wksp(entropy->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted);
 		dictPtr += matchlengthHeaderSize;
 	}
 
@@ -1904,7 +1916,7 @@
 			return ERROR(dictionary_corrupted);
 		if (litlengthLog > LLFSELog)
 			return ERROR(dictionary_corrupted);
-		CHECK_E(FSE_buildDTable(entropy->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog), dictionary_corrupted);
+		CHECK_E(FSE_buildDTable_wksp(entropy->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted);
 		dictPtr += litlengthHeaderSize;
 	}
 
diff --git a/contrib/linux-kernel/lib/zstd/entropy_common.c b/contrib/linux-kernel/lib/zstd/entropy_common.c
index b354fc2..2b0a643 100644
--- a/contrib/linux-kernel/lib/zstd/entropy_common.c
+++ b/contrib/linux-kernel/lib/zstd/entropy_common.c
@@ -164,7 +164,7 @@
 	@return : size read from `src` , or an error Code .
 	Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
 */
-size_t HUF_readStats(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize)
+size_t HUF_readStats_wksp(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
 {
 	U32 weightTotal;
 	const BYTE *ip = (const BYTE *)src;
@@ -192,10 +192,9 @@
 			}
 		}
 	} else {						 /* header compressed with FSE (normal case) */
-		FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)]; /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
 		if (iSize + 1 > srcSize)
 			return ERROR(srcSize_wrong);
-		oSize = FSE_decompress_wksp(huffWeight, hwSize - 1, ip + 1, iSize, fseWorkspace, 6); /* max (hwSize-1) values decoded, as last one is implied */
+		oSize = FSE_decompress_wksp(huffWeight, hwSize - 1, ip + 1, iSize, 6, workspace, workspaceSize); /* max (hwSize-1) values decoded, as last one is implied */
 		if (FSE_isError(oSize))
 			return oSize;
 	}
diff --git a/contrib/linux-kernel/lib/zstd/fse.h b/contrib/linux-kernel/lib/zstd/fse.h
index bc2962a..7460ab0 100644
--- a/contrib/linux-kernel/lib/zstd/fse.h
+++ b/contrib/linux-kernel/lib/zstd/fse.h
@@ -187,7 +187,7 @@
 /*! FSE_buildDTable():
 	Builds 'dt', which must be already allocated, using FSE_createDTable().
 	return : 0, or an errorCode, which can be tested using FSE_isError() */
-FSE_PUBLIC_API size_t FSE_buildDTable(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize);
 
 /*! FSE_decompress_usingDTable():
 	Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
@@ -263,15 +263,6 @@
 unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
 /**< same as FSE_optimalTableLog(), which used `minus==2` */
 
-/* FSE_compress_wksp() :
- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
- * FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
- */
-#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) \
-	(FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024))
-size_t FSE_compress_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
-			 size_t wkspSize);
-
 size_t FSE_buildCTable_raw(FSE_CTable *ct, unsigned nbBits);
 /**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
 
@@ -290,7 +281,7 @@
 size_t FSE_buildDTable_rle(FSE_DTable *dt, unsigned char symbolValue);
 /**< build a fake FSE_DTable, designed to always generate the same symbolValue */
 
-size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, FSE_DTable *workSpace, unsigned maxLog);
+size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, unsigned maxLog, void *workspace, size_t workspaceSize);
 /**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
 
 /* *****************************************
diff --git a/contrib/linux-kernel/lib/zstd/fse_compress.c b/contrib/linux-kernel/lib/zstd/fse_compress.c
index e016bb1..ef3d174 100644
--- a/contrib/linux-kernel/lib/zstd/fse_compress.c
+++ b/contrib/linux-kernel/lib/zstd/fse_compress.c
@@ -48,6 +48,8 @@
 #include "bitstream.h"
 #include "fse.h"
 #include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/math64.h>
 #include <linux/string.h> /* memcpy, memset */
 
 /* **************************************************************
@@ -87,7 +89,7 @@
  * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
  * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
  */
-size_t FSE_buildCTable_wksp(FSE_CTable *ct, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workSpace, size_t wkspSize)
+size_t FSE_buildCTable_wksp(FSE_CTable *ct, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize)
 {
 	U32 const tableSize = 1 << tableLog;
 	U32 const tableMask = tableSize - 1;
@@ -96,14 +98,23 @@
 	void *const FSCT = ((U32 *)ptr) + 1 /* header */ + (tableLog ? tableSize >> 1 : 1);
 	FSE_symbolCompressionTransform *const symbolTT = (FSE_symbolCompressionTransform *)(FSCT);
 	U32 const step = FSE_TABLESTEP(tableSize);
-	U32 cumul[FSE_MAX_SYMBOL_VALUE + 2];
-
-	FSE_FUNCTION_TYPE *const tableSymbol = (FSE_FUNCTION_TYPE *)workSpace;
 	U32 highThreshold = tableSize - 1;
 
-	/* CTable header */
-	if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize)
+	U32 *cumul;
+	FSE_FUNCTION_TYPE *tableSymbol;
+	size_t spaceUsed32 = 0;
+
+	cumul = (U32 *)workspace + spaceUsed32;
+	spaceUsed32 += FSE_MAX_SYMBOL_VALUE + 2;
+	tableSymbol = (FSE_FUNCTION_TYPE *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(sizeof(FSE_FUNCTION_TYPE) * ((size_t)1 << tableLog), sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
 		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
+
+	/* CTable header */
 	tableU16[-2] = (U16)tableLog;
 	tableU16[-1] = (U16)maxSymbolValue;
 
@@ -575,7 +586,7 @@
 	{
 		U64 const vStepLog = 62 - tableLog;
 		U64 const mid = (1ULL << (vStepLog - 1)) - 1;
-		U64 const rStep = ((((U64)1 << vStepLog) * ToDistribute) + mid) / total; /* scale on remaining */
+		U64 const rStep = div_u64((((U64)1 << vStepLog) * ToDistribute) + mid, (U32)total); /* scale on remaining */
 		U64 tmpTotal = mid;
 		for (s = 0; s <= maxSymbolValue; s++) {
 			if (norm[s] == NOT_YET_ASSIGNED) {
@@ -609,7 +620,7 @@
 	{
 		U32 const rtbTable[] = {0, 473195, 504333, 520860, 550000, 700000, 750000, 830000};
 		U64 const scale = 62 - tableLog;
-		U64 const step = ((U64)1 << 62) / total; /* <== here, one division ! */
+		U64 const step = div_u64((U64)1 << 62, (U32)total); /* <== here, one division ! */
 		U64 const vStep = 1ULL << (scale - 20);
 		int stillToDistribute = 1 << tableLog;
 		unsigned s;
@@ -782,76 +793,3 @@
 }
 
 size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
-
-#define CHECK_V_F(e, f)     \
-	size_t const e = f; \
-	if (ERR_isError(e)) \
-	return f
-#define CHECK_F(f)                        \
-	{                                 \
-		CHECK_V_F(_var_err__, f); \
-	}
-
-/* FSE_compress_wksp() :
- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
- * `wkspSize` size must be `(1<<tableLog)`.
- */
-size_t FSE_compress_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
-			 size_t wkspSize)
-{
-	BYTE *const ostart = (BYTE *)dst;
-	BYTE *op = ostart;
-	BYTE *const oend = ostart + dstSize;
-
-	U32 count[FSE_MAX_SYMBOL_VALUE + 1];
-	S16 norm[FSE_MAX_SYMBOL_VALUE + 1];
-	FSE_CTable *CTable = (FSE_CTable *)workSpace;
-	size_t const CTableSize = FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue);
-	void *scratchBuffer = (void *)(CTable + CTableSize);
-	size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable));
-
-	/* init conditions */
-	if (wkspSize < FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue))
-		return ERROR(tableLog_tooLarge);
-	if (srcSize <= 1)
-		return 0; /* Not compressible */
-	if (!maxSymbolValue)
-		maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
-	if (!tableLog)
-		tableLog = FSE_DEFAULT_TABLELOG;
-
-	/* Scan input and build symbol stats */
-	{
-		CHECK_V_F(maxCount, FSE_count_wksp(count, &maxSymbolValue, src, srcSize, (unsigned *)scratchBuffer));
-		if (maxCount == srcSize)
-			return 1; /* only a single symbol in src : rle */
-		if (maxCount == 1)
-			return 0; /* each symbol present maximum once => not compressible */
-		if (maxCount < (srcSize >> 7))
-			return 0; /* Heuristic : not compressible enough */
-	}
-
-	tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
-	CHECK_F(FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue));
-
-	/* Write table description header */
-	{
-		CHECK_V_F(nc_err, FSE_writeNCount(op, oend - op, norm, maxSymbolValue, tableLog));
-		op += nc_err;
-	}
-
-	/* Compress */
-	CHECK_F(FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, scratchBufferSize));
-	{
-		CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, src, srcSize, CTable));
-		if (cSize == 0)
-			return 0; /* not enough space for compressed data */
-		op += cSize;
-	}
-
-	/* check compressibility */
-	if ((size_t)(op - ostart) >= srcSize - 1)
-		return 0;
-
-	return op - ostart;
-}
diff --git a/contrib/linux-kernel/lib/zstd/fse_decompress.c b/contrib/linux-kernel/lib/zstd/fse_decompress.c
index 96cf89f..a84300e 100644
--- a/contrib/linux-kernel/lib/zstd/fse_decompress.c
+++ b/contrib/linux-kernel/lib/zstd/fse_decompress.c
@@ -48,6 +48,7 @@
 #include "bitstream.h"
 #include "fse.h"
 #include <linux/compiler.h>
+#include <linux/kernel.h>
 #include <linux/string.h> /* memcpy, memset */
 
 /* **************************************************************
@@ -91,17 +92,19 @@
 
 /* Function templates */
 
-size_t FSE_buildDTable(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+size_t FSE_buildDTable_wksp(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize)
 {
 	void *const tdPtr = dt + 1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
 	FSE_DECODE_TYPE *const tableDecode = (FSE_DECODE_TYPE *)(tdPtr);
-	U16 symbolNext[FSE_MAX_SYMBOL_VALUE + 1];
+	U16 *symbolNext = (U16 *)workspace;
 
 	U32 const maxSV1 = maxSymbolValue + 1;
 	U32 const tableSize = 1 << tableLog;
 	U32 highThreshold = tableSize - 1;
 
 	/* Sanity Checks */
+	if (workspaceSize < sizeof(U16) * (FSE_MAX_SYMBOL_VALUE + 1))
+		return ERROR(tableLog_tooLarge);
 	if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE)
 		return ERROR(maxSymbolValue_tooLarge);
 	if (tableLog > FSE_MAX_TABLELOG)
@@ -288,16 +291,32 @@
 	return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
 }
 
-size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, FSE_DTable *workSpace, unsigned maxLog)
+size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, unsigned maxLog, void *workspace, size_t workspaceSize)
 {
 	const BYTE *const istart = (const BYTE *)cSrc;
 	const BYTE *ip = istart;
-	short counting[FSE_MAX_SYMBOL_VALUE + 1];
 	unsigned tableLog;
 	unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+	size_t NCountLength;
+
+	FSE_DTable *dt;
+	short *counting;
+	size_t spaceUsed32 = 0;
+
+	FSE_STATIC_ASSERT(sizeof(FSE_DTable) == sizeof(U32));
+
+	dt = (FSE_DTable *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += FSE_DTABLE_SIZE_U32(maxLog);
+	counting = (short *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(sizeof(short) * (FSE_MAX_SYMBOL_VALUE + 1), sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
+		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
 
 	/* normal FSE decoding mode */
-	size_t const NCountLength = FSE_readNCount(counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+	NCountLength = FSE_readNCount(counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
 	if (FSE_isError(NCountLength))
 		return NCountLength;
 	// if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size; supposed to be already checked in NCountLength, only remaining
@@ -307,7 +326,7 @@
 	ip += NCountLength;
 	cSrcSize -= NCountLength;
 
-	CHECK_F(FSE_buildDTable(workSpace, counting, maxSymbolValue, tableLog));
+	CHECK_F(FSE_buildDTable_wksp(dt, counting, maxSymbolValue, tableLog, workspace, workspaceSize));
 
-	return FSE_decompress_usingDTable(dst, dstCapacity, ip, cSrcSize, workSpace); /* always return, even if it is an error code */
+	return FSE_decompress_usingDTable(dst, dstCapacity, ip, cSrcSize, dt); /* always return, even if it is an error code */
 }
diff --git a/contrib/linux-kernel/lib/zstd/huf.h b/contrib/linux-kernel/lib/zstd/huf.h
index 56abe2f..2143da2 100644
--- a/contrib/linux-kernel/lib/zstd/huf.h
+++ b/contrib/linux-kernel/lib/zstd/huf.h
@@ -55,7 +55,7 @@
 /** HUF_compress4X_wksp() :
 *   Same as HUF_compress2(), but uses externally allocated `workSpace`, which must be a table of >= 1024 unsigned */
 size_t HUF_compress4X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
-			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
 
 /* *** Dependencies *** */
 #include "mem.h" /* U32 */
@@ -91,17 +91,23 @@
 #define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = {((U32)(maxTableLog)*0x01000001)}
 
 /* The workspace must have alignment at least 4 and be at least this large */
-#define HUF_WORKSPACE_SIZE (6 << 10)
-#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
+#define HUF_COMPRESS_WORKSPACE_SIZE (6 << 10)
+#define HUF_COMPRESS_WORKSPACE_SIZE_U32 (HUF_COMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+/* The workspace must have alignment at least 4 and be at least this large */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE (3 << 10)
+#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
 
 /* ****************************************
 *  Advanced decompression functions
 ******************************************/
-size_t HUF_decompress4X_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize); /**< decodes RLE and uncompressed */
-size_t HUF_decompress4X_hufOnly(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc,
-				size_t cSrcSize);							       /**< considers RLE and uncompressed as errors */
-size_t HUF_decompress4X2_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize); /**< single-symbol decoder */
-size_t HUF_decompress4X4_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize); /**< double-symbols decoder */
+size_t HUF_decompress4X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize); /**< decodes RLE and uncompressed */
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				size_t workspaceSize);							       /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< single-symbol decoder */
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< double-symbols decoder */
 
 /* ****************************************
 *  HUF detailed API
@@ -111,7 +117,7 @@
 1. count symbol occurrence from source[] into table count[] using FSE_count()
 2. (optional) refine tableLog using HUF_optimalTableLog()
 3. build Huffman table from count using HUF_buildCTable()
-4. save Huffman table to memory buffer using HUF_writeCTable()
+4. save Huffman table to memory buffer using HUF_writeCTable_wksp()
 5. encode the data stream using HUF_compress4X_usingCTable()
 
 The following API allows targeting specific sub-functions for advanced tasks.
@@ -121,7 +127,7 @@
 /* FSE_count() : find it within "fse.h" */
 unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
 typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */
-size_t HUF_writeCTable(void *dst, size_t maxDstSize, const HUF_CElt *CTable, unsigned maxSymbolValue, unsigned huffLog);
+size_t HUF_writeCTable_wksp(void *dst, size_t maxDstSize, const HUF_CElt *CTable, unsigned maxSymbolValue, unsigned huffLog, void *workspace, size_t workspaceSize);
 size_t HUF_compress4X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable);
 
 typedef enum {
@@ -137,7 +143,7 @@
 *   If preferRepeat then the old table will always be used if valid. */
 size_t HUF_compress4X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
 			     size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat,
-			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
 
 /** HUF_buildCTable_wksp() :
  *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
@@ -150,11 +156,12 @@
 	`huffWeight` is destination buffer.
 	@return : size read from `src` , or an error Code .
 	Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
-size_t HUF_readStats(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize);
+size_t HUF_readStats_wksp(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize,
+			  void *workspace, size_t workspaceSize);
 
 /** HUF_readCTable() :
 *   Loading a CTable saved with HUF_writeCTable() */
-size_t HUF_readCTable(HUF_CElt *CTable, unsigned maxSymbolValue, const void *src, size_t srcSize);
+size_t HUF_readCTable_wksp(HUF_CElt *CTable, unsigned maxSymbolValue, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
 
 /*
 HUF_decompress() does the following:
@@ -170,8 +177,8 @@
 *   Assumption : 0 < cSrcSize < dstSize <= 128 KB */
 U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize);
 
-size_t HUF_readDTableX2(HUF_DTable *DTable, const void *src, size_t srcSize);
-size_t HUF_readDTableX4(HUF_DTable *DTable, const void *src, size_t srcSize);
+size_t HUF_readDTableX2_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+size_t HUF_readDTableX4_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
 
 size_t HUF_decompress4X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
 size_t HUF_decompress4X2_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
@@ -180,7 +187,7 @@
 /* single stream variants */
 
 size_t HUF_compress1X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
-			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
 size_t HUF_compress1X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable);
 /** HUF_compress1X_repeat() :
 *   Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
@@ -189,11 +196,13 @@
 *   If preferRepeat then the old table will always be used if valid. */
 size_t HUF_compress1X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
 			     size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat,
-			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
 
-size_t HUF_decompress1X_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize);
-size_t HUF_decompress1X2_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize); /**< single-symbol decoder */
-size_t HUF_decompress1X4_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize); /**< double-symbols decoder */
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize);
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< single-symbol decoder */
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< double-symbols decoder */
 
 size_t HUF_decompress1X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize,
 				    const HUF_DTable *DTable); /**< automatic selection of sing or double symbol decoder, based on DTable */
diff --git a/contrib/linux-kernel/lib/zstd/huf_compress.c b/contrib/linux-kernel/lib/zstd/huf_compress.c
index e82a136..0361f38 100644
--- a/contrib/linux-kernel/lib/zstd/huf_compress.c
+++ b/contrib/linux-kernel/lib/zstd/huf_compress.c
@@ -43,6 +43,7 @@
 #include "bitstream.h"
 #include "fse.h" /* header compression */
 #include "huf.h"
+#include <linux/kernel.h>
 #include <linux/string.h> /* memcpy, memset */
 
 /* **************************************************************
@@ -78,7 +79,7 @@
  * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
  */
 #define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
-size_t HUF_compressWeights(void *dst, size_t dstSize, const void *weightTable, size_t wtSize)
+size_t HUF_compressWeights_wksp(void *dst, size_t dstSize, const void *weightTable, size_t wtSize, void *workspace, size_t workspaceSize)
 {
 	BYTE *const ostart = (BYTE *)dst;
 	BYTE *op = ostart;
@@ -87,11 +88,24 @@
 	U32 maxSymbolValue = HUF_TABLELOG_MAX;
 	U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
 
-	FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
-	BYTE scratchBuffer[1 << MAX_FSE_TABLELOG_FOR_HUFF_HEADER];
+	FSE_CTable *CTable;
+	U32 *count;
+	S16 *norm;
+	size_t spaceUsed32 = 0;
 
-	U32 count[HUF_TABLELOG_MAX + 1];
-	S16 norm[HUF_TABLELOG_MAX + 1];
+	HUF_STATIC_ASSERT(sizeof(FSE_CTable) == sizeof(U32));
+
+	CTable = (FSE_CTable *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX);
+	count = (U32 *)workspace + spaceUsed32;
+	spaceUsed32 += HUF_TABLELOG_MAX + 1;
+	norm = (S16 *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(sizeof(S16) * (HUF_TABLELOG_MAX + 1), sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
+		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
 
 	/* init conditions */
 	if (wtSize <= 1)
@@ -116,7 +130,7 @@
 	}
 
 	/* Compress */
-	CHECK_F(FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)));
+	CHECK_F(FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, workspace, workspaceSize));
 	{
 		CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, weightTable, wtSize, CTable));
 		if (cSize == 0)
@@ -132,16 +146,28 @@
 	BYTE nbBits;
 }; /* typedef'd to HUF_CElt within "huf.h" */
 
-/*! HUF_writeCTable() :
+/*! HUF_writeCTable_wksp() :
 	`CTable` : Huffman tree to save, using huf representation.
 	@return : size of saved CTable */
-size_t HUF_writeCTable(void *dst, size_t maxDstSize, const HUF_CElt *CTable, U32 maxSymbolValue, U32 huffLog)
+size_t HUF_writeCTable_wksp(void *dst, size_t maxDstSize, const HUF_CElt *CTable, U32 maxSymbolValue, U32 huffLog, void *workspace, size_t workspaceSize)
 {
-	BYTE bitsToWeight[HUF_TABLELOG_MAX + 1]; /* precomputed conversion table */
-	BYTE huffWeight[HUF_SYMBOLVALUE_MAX];
 	BYTE *op = (BYTE *)dst;
 	U32 n;
 
+	BYTE *bitsToWeight;
+	BYTE *huffWeight;
+	size_t spaceUsed32 = 0;
+
+	bitsToWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(HUF_TABLELOG_MAX + 1, sizeof(U32)) >> 2;
+	huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX, sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
+		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
+
 	/* check conditions */
 	if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
 		return ERROR(maxSymbolValue_tooLarge);
@@ -155,7 +181,7 @@
 
 	/* attempt weights compression by FSE */
 	{
-		CHECK_V_F(hSize, HUF_compressWeights(op + 1, maxDstSize - 1, huffWeight, maxSymbolValue));
+		CHECK_V_F(hSize, HUF_compressWeights_wksp(op + 1, maxDstSize - 1, huffWeight, maxSymbolValue, workspace, workspaceSize));
 		if ((hSize > 1) & (hSize < maxSymbolValue / 2)) { /* FSE compressed */
 			op[0] = (BYTE)hSize;
 			return hSize + 1;
@@ -174,15 +200,29 @@
 	return ((maxSymbolValue + 1) / 2) + 1;
 }
 
-size_t HUF_readCTable(HUF_CElt *CTable, U32 maxSymbolValue, const void *src, size_t srcSize)
+size_t HUF_readCTable_wksp(HUF_CElt *CTable, U32 maxSymbolValue, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
 {
-	BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];  /* init not required, even though some static analyzer may complain */
-	U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */
+	U32 *rankVal;
+	BYTE *huffWeight;
 	U32 tableLog = 0;
 	U32 nbSymbols = 0;
+	size_t readSize;
+	size_t spaceUsed32 = 0;
+
+	rankVal = (U32 *)workspace + spaceUsed32;
+	spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
+	huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
+		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
 
 	/* get symbol weights */
-	CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize));
+	readSize = HUF_readStats_wksp(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize);
+	if (ERR_isError(readSize))
+		return readSize;
 
 	/* check result */
 	if (tableLog > HUF_TABLELOG_MAX)
@@ -680,7 +720,7 @@
 
 	/* Write table description header */
 	{
-		CHECK_V_F(hSize, HUF_writeCTable(op, dstSize, CTable, maxSymbolValue, huffLog));
+		CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, CTable, maxSymbolValue, huffLog, workSpace, wkspSize));
 		/* Check if using the previous table will be beneficial */
 		if (repeat && *repeat != HUF_repeat_none) {
 			size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, count, maxSymbolValue);
diff --git a/contrib/linux-kernel/lib/zstd/huf_decompress.c b/contrib/linux-kernel/lib/zstd/huf_decompress.c
index 950c194..6526482 100644
--- a/contrib/linux-kernel/lib/zstd/huf_decompress.c
+++ b/contrib/linux-kernel/lib/zstd/huf_decompress.c
@@ -49,6 +49,7 @@
 #include "fse.h"       /* header compression */
 #include "huf.h"
 #include <linux/compiler.h>
+#include <linux/kernel.h>
 #include <linux/string.h> /* memcpy, memset */
 
 /* **************************************************************
@@ -86,20 +87,32 @@
 	BYTE nbBits;
 } HUF_DEltX2; /* single-symbol decoding */
 
-size_t HUF_readDTableX2(HUF_DTable *DTable, const void *src, size_t srcSize)
+size_t HUF_readDTableX2_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
 {
-	BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
-	U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */
 	U32 tableLog = 0;
 	U32 nbSymbols = 0;
 	size_t iSize;
 	void *const dtPtr = DTable + 1;
 	HUF_DEltX2 *const dt = (HUF_DEltX2 *)dtPtr;
 
+	U32 *rankVal;
+	BYTE *huffWeight;
+	size_t spaceUsed32 = 0;
+
+	rankVal = (U32 *)workspace + spaceUsed32;
+	spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
+	huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
+		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
+
 	HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
 	/* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
 
-	iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+	iSize = HUF_readStats_wksp(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize);
 	if (HUF_isError(iSize))
 		return iSize;
 
@@ -216,11 +229,11 @@
 	return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
 }
 
-size_t HUF_decompress1X2_DCtx(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 {
 	const BYTE *ip = (const BYTE *)cSrc;
 
-	size_t const hSize = HUF_readDTableX2(DCtx, cSrc, cSrcSize);
+	size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workspace, workspaceSize);
 	if (HUF_isError(hSize))
 		return hSize;
 	if (hSize >= cSrcSize)
@@ -347,11 +360,11 @@
 	return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
 }
 
-size_t HUF_decompress4X2_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 {
 	const BYTE *ip = (const BYTE *)cSrc;
 
-	size_t const hSize = HUF_readDTableX2(dctx, cSrc, cSrcSize);
+	size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workspace, workspaceSize);
 	if (HUF_isError(hSize))
 		return hSize;
 	if (hSize >= cSrcSize)
@@ -422,6 +435,7 @@
 }
 
 typedef U32 rankVal_t[HUF_TABLELOG_MAX][HUF_TABLELOG_MAX + 1];
+typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
 
 static void HUF_fillDTableX4(HUF_DEltX4 *DTable, const U32 targetLog, const sortedSymbol_t *sortedList, const U32 sortedListSize, const U32 *rankStart,
 			     rankVal_t rankValOrigin, const U32 maxWeight, const U32 nbBitsBaseline)
@@ -465,27 +479,50 @@
 	}
 }
 
-size_t HUF_readDTableX4(HUF_DTable *DTable, const void *src, size_t srcSize)
+size_t HUF_readDTableX4_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
 {
-	BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
-	sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
-	U32 rankStats[HUF_TABLELOG_MAX + 1] = {0};
-	U32 rankStart0[HUF_TABLELOG_MAX + 2] = {0};
-	U32 *const rankStart = rankStart0 + 1;
-	rankVal_t rankVal;
 	U32 tableLog, maxW, sizeOfSort, nbSymbols;
 	DTableDesc dtd = HUF_getDTableDesc(DTable);
 	U32 const maxTableLog = dtd.maxTableLog;
 	size_t iSize;
 	void *dtPtr = DTable + 1; /* force compiler to avoid strict-aliasing */
 	HUF_DEltX4 *const dt = (HUF_DEltX4 *)dtPtr;
+	U32 *rankStart;
+
+	rankValCol_t *rankVal;
+	U32 *rankStats;
+	U32 *rankStart0;
+	sortedSymbol_t *sortedSymbol;
+	BYTE *weightList;
+	size_t spaceUsed32 = 0;
+
+	HUF_STATIC_ASSERT((sizeof(rankValCol_t) & 3) == 0);
+
+	rankVal = (rankValCol_t *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
+	rankStats = (U32 *)workspace + spaceUsed32;
+	spaceUsed32 += HUF_TABLELOG_MAX + 1;
+	rankStart0 = (U32 *)workspace + spaceUsed32;
+	spaceUsed32 += HUF_TABLELOG_MAX + 2;
+	sortedSymbol = (sortedSymbol_t *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
+	weightList = (BYTE *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
+		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
+
+	rankStart = rankStart0 + 1;
+	memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
 
 	HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */
 	if (maxTableLog > HUF_TABLELOG_MAX)
 		return ERROR(tableLog_tooLarge);
 	/* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
 
-	iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+	iSize = HUF_readStats_wksp(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize);
 	if (HUF_isError(iSize))
 		return iSize;
 
@@ -652,11 +689,11 @@
 	return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
 }
 
-size_t HUF_decompress1X4_DCtx(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 {
 	const BYTE *ip = (const BYTE *)cSrc;
 
-	size_t const hSize = HUF_readDTableX4(DCtx, cSrc, cSrcSize);
+	size_t const hSize = HUF_readDTableX4_wksp(DCtx, cSrc, cSrcSize, workspace, workspaceSize);
 	if (HUF_isError(hSize))
 		return hSize;
 	if (hSize >= cSrcSize)
@@ -785,11 +822,11 @@
 	return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
 }
 
-size_t HUF_decompress4X4_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 {
 	const BYTE *ip = (const BYTE *)cSrc;
 
-	size_t hSize = HUF_readDTableX4(dctx, cSrc, cSrcSize);
+	size_t hSize = HUF_readDTableX4_wksp(dctx, cSrc, cSrcSize, workspace, workspaceSize);
 	if (HUF_isError(hSize))
 		return hSize;
 	if (hSize >= cSrcSize)
@@ -861,7 +898,7 @@
 
 typedef size_t (*decompressionAlgo)(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize);
 
-size_t HUF_decompress4X_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
+size_t HUF_decompress4X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 {
 	/* validation checks */
 	if (dstSize == 0)
@@ -879,11 +916,12 @@
 
 	{
 		U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-		return algoNb ? HUF_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+		return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize)
+			      : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize);
 	}
 }
 
-size_t HUF_decompress4X_hufOnly(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 {
 	/* validation checks */
 	if (dstSize == 0)
@@ -893,11 +931,12 @@
 
 	{
 		U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-		return algoNb ? HUF_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+		return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize)
+			      : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize);
 	}
 }
 
-size_t HUF_decompress1X_DCtx(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize)
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
 {
 	/* validation checks */
 	if (dstSize == 0)
@@ -915,6 +954,7 @@
 
 	{
 		U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-		return algoNb ? HUF_decompress1X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : HUF_decompress1X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+		return algoNb ? HUF_decompress1X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize)
+			      : HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize);
 	}
 }
diff --git a/contrib/linux-kernel/lib/zstd/zstd_common.c b/contrib/linux-kernel/lib/zstd/zstd_common.c
index 6ebf68d..a282624 100644
--- a/contrib/linux-kernel/lib/zstd/zstd_common.c
+++ b/contrib/linux-kernel/lib/zstd/zstd_common.c
@@ -51,7 +51,7 @@
 void *ZSTD_stackAllocAll(void *opaque, size_t *size)
 {
 	ZSTD_stack *stack = (ZSTD_stack *)opaque;
-	*size = stack->end - ZSTD_PTR_ALIGN(stack->ptr);
+	*size = (BYTE const *)stack->end - (BYTE *)ZSTD_PTR_ALIGN(stack->ptr);
 	return stack_push(stack, *size);
 }
 
diff --git a/contrib/linux-kernel/test/Makefile b/contrib/linux-kernel/test/Makefile
index 892264f..8411462 100644
--- a/contrib/linux-kernel/test/Makefile
+++ b/contrib/linux-kernel/test/Makefile
@@ -5,21 +5,21 @@
 OBJECTS := $(patsubst %.c,%.o,$(SOURCES))
 
 ARFLAGS := rcs
-CXXFLAGS += -std=c++11
-CFLAGS += -g -O0
+CXXFLAGS += -std=c++11 -g -O3 -Wcast-align
+CFLAGS += -g -O3 -Wframe-larger-than=400 -Wcast-align
 CPPFLAGS += $(IFLAGS)
 
 ../lib/zstd/libzstd.a: $(OBJECTS)
 	$(AR) $(ARFLAGS) $@ $^
 
 DecompressCrash: DecompressCrash.o $(OBJECTS) libFuzzer.a
-	$(CXX) $(TEST_CPPFLAGS) $(TEST_CXXFLAGS) $(LDFLAGS) $^ -o $@
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) $^ -o $@
 
 RoundTripCrash: RoundTripCrash.o $(OBJECTS) ../lib/xxhash.o libFuzzer.a
-	$(CXX) $(TEST_CPPFLAGS) $(TEST_CXXFLAGS) $(LDFLAGS) $^ -o $@
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) $^ -o $@
 
 UserlandTest: UserlandTest.cpp ../lib/zstd/libzstd.a ../lib/xxhash.o
-	$(CXX) $(CXXFLAGS) $(CFLAGS) $(CPPFLAGS) $^ googletest/build/googlemock/gtest/libgtest.a googletest/build/googlemock/gtest/libgtest_main.a -o $@
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $^ googletest/build/googlemock/gtest/libgtest.a googletest/build/googlemock/gtest/libgtest_main.a -o $@
 
 XXHashUserlandTest: XXHashUserlandTest.cpp ../lib/xxhash.o ../../../lib/common/xxhash.o
 	$(CXX) $(CXXFLAGS) $(CFLAGS) $(CPPFLAGS) $^ googletest/build/googlemock/gtest/libgtest.a googletest/build/googlemock/gtest/libgtest_main.a -o $@
@@ -39,5 +39,5 @@
 	@cd googletest/build && cmake .. && $(MAKE)
 
 clean:
-	$(RM) -f *.{o,a} ../lib/zstd/*.{o,a}
+	$(RM) -f *.{o,a} ../lib/zstd/*.{o,a} ../lib/*.o
 	$(RM) -f DecompressCrash RoundTripCrash UserlandTest XXHashUserlandTest
diff --git a/contrib/linux-kernel/test/include/linux/math64.h b/contrib/linux-kernel/test/include/linux/math64.h
new file mode 100644
index 0000000..3d0ae72
--- /dev/null
+++ b/contrib/linux-kernel/test/include/linux/math64.h
@@ -0,0 +1,11 @@
+#ifndef LINUX_MATH64_H
+#define LINUX_MATH64_H
+
+#include <stdint.h>
+
+static uint64_t div_u64(uint64_t n, uint32_t d)
+{
+  return n / d;
+}
+
+#endif
diff --git a/doc/zstd_manual.html b/doc/zstd_manual.html
index 63704e6..cd2b06d 100644
--- a/doc/zstd_manual.html
+++ b/doc/zstd_manual.html
@@ -332,7 +332,7 @@
 </b></pre><BR>
 <pre><b>typedef struct {
     unsigned long long frameContentSize;
-    unsigned windowSize;
+    size_t windowSize;
     unsigned dictID;
     unsigned checksumFlag;
 } ZSTD_frameHeader;
@@ -389,6 +389,12 @@
             however it does mean that all frame data must be present and valid. 
 </p></pre><BR>
 
+<pre><b>size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+</b><p>   `src` should point to the start of a ZSTD frame
+   `srcSize` must be >= ZSTD_frameHeaderSize_prefix.
+   @return : size of the Frame Header 
+</p></pre><BR>
+
 <a name="Chapter13"></a><h2>Context memory usage</h2><pre></pre>
 
 <pre><b>size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
@@ -401,25 +407,41 @@
   Object memory usage can evolve if it's re-used multiple times. 
 </p></pre><BR>
 
-<pre><b>size_t ZSTD_estimateCCtxSize(ZSTD_compressionParameters cParams);
+<pre><b>size_t ZSTD_estimateCCtxSize(int compressionLevel);
+size_t ZSTD_estimateCCtxSize_advanced(ZSTD_compressionParameters cParams);
 size_t ZSTD_estimateDCtxSize(void);
 </b><p>  These functions make it possible to estimate memory usage
-  of a future target object, before its allocation,
-  given a set of parameters, which vary depending on target object.
-  The objective is to guide decision before allocation.
+  of a future {D,C}Ctx, before its creation.
+  ZSTD_estimateCCtxSize() will provide a budget large enough for any compression level up to selected one.
+  It will also consider src size to be arbitrarily "large", which is worst case.
+  If srcSize is known to always be small, ZSTD_estimateCCtxSize_advanced() can provide a tighter estimation.
+  ZSTD_estimateCCtxSize_advanced() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
   Note : CCtx estimation is only correct for single-threaded compression 
 </p></pre><BR>
 
-<pre><b>size_t ZSTD_estimateCStreamSize(ZSTD_compressionParameters cParams);
-size_t ZSTD_estimateDStreamSize(ZSTD_frameHeader fHeader);
-</b><p>  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
-         an internal ?Dict will be created, which size is not estimated here.
+<pre><b>size_t ZSTD_estimateCStreamSize(int compressionLevel);
+size_t ZSTD_estimateCStreamSize_advanced(ZSTD_compressionParameters cParams);
+size_t ZSTD_estimateDStreamSize(size_t windowSize);
+size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+</b><p>  ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
+  It will also consider src size to be arbitrarily "large", which is worst case.
+  If srcSize is known to always be small, ZSTD_estimateCStreamSize_advanced() can provide a tighter estimation.
+  ZSTD_estimateCStreamSize_advanced() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+  Note : CStream estimation is only correct for single-threaded compression.
+  ZSTD_DStream memory budget depends on window Size.
+  This information can be passed manually, using ZSTD_estimateDStreamSize,
+  or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
+         an internal ?Dict will be created, whose additional size is not estimated here.
          In this case, get total size by adding ZSTD_estimate?DictSize 
 </p></pre><BR>
 
-<pre><b>size_t ZSTD_estimateCDictSize(ZSTD_compressionParameters cParams, size_t dictSize, unsigned byReference);
+<pre><b>size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, unsigned byReference);
 size_t ZSTD_estimateDDictSize(size_t dictSize, unsigned byReference);
-</b><p>  Note : dictionary created "byReference" are smaller 
+</b><p>  ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+  ZSTD_estimateCDictSize_advanced() makes it possible to control precisely compression parameters, like ZSTD_createCDict_advanced().
+  Note : dictionary created "byReference" are smaller 
 </p></pre><BR>
 
 <a name="Chapter14"></a><h2>Advanced compression functions</h2><pre></pre>
@@ -461,7 +483,10 @@
   It is important that dictBuffer outlives CDict, it must remain read accessible throughout the lifetime of CDict 
 </p></pre><BR>
 
-<pre><b>typedef enum { ZSTD_dm_auto=0, ZSTD_dm_rawContent, ZSTD_dm_fullDict } ZSTD_dictMode_e;
+<pre><b>typedef enum { ZSTD_dm_auto=0,        </b>/* dictionary is "full" if it starts with ZSTD_MAGIC_DICTIONARY, rawContent otherwise */<b>
+               ZSTD_dm_rawContent,    </b>/* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */<b>
+               ZSTD_dm_fullDict       </b>/* refuses to load a dictionary if it does not respect Zstandard's specification */<b>
+} ZSTD_dictMode_e;
 </b></pre><BR>
 <pre><b>ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
                                       unsigned byReference, ZSTD_dictMode_e dictMode,
@@ -606,6 +631,7 @@
 <a name="Chapter16"></a><h2>Advanced streaming functions</h2><pre></pre>
 
 <h3>Advanced Streaming compression functions</h3><pre></pre><b><pre>ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    </b>/**< same as ZSTD_initStaticCCtx() */<b>
 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize);   </b>/**< pledgedSrcSize must be correct, a size of 0 means unknown.  for a frame size of 0 use initCStream_advanced */<b>
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); </b>/**< creates of an internal CDict (incompatible with static CCtx), except if dict == NULL or dictSize < 8, in which case no dict is used. */<b>
 size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize,
@@ -624,6 +650,7 @@
 
 <h3>Advanced Streaming decompression functions</h3><pre></pre><b><pre>typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e;
 ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    </b>/**< same as ZSTD_initStaticDCtx() */<b>
 size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds, ZSTD_DStreamParameter_e paramType, unsigned paramValue);
 size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); </b>/**< note: a dict will not be used if dict == NULL or dictSize < 8 */<b>
 size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);  </b>/**< note : ddict will just be referenced, and must outlive decompression session */<b>
@@ -678,16 +705,18 @@
   A ZSTD_DCtx object can be re-used multiple times.
 
   First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
-  It fills a ZSTD_frameParams structure which provide important information to correctly decode the frame,
-  such as the minimum rolling buffer size to allocate to decompress data (`windowSize`),
-  and the dictionary ID used.
+  It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
+  such as minimum rolling buffer size to allocate to decompress data (`windowSize`),
+  and the dictionary ID in use.
   (Note : content size is optional, it may not be present. 0 means : content size unknown).
   Note that these values could be wrong, either because of data malformation, or because an attacker is spoofing deliberate false information.
   As a consequence, check that values remain within valid application range, especially `windowSize`, before allocation.
-  Each application can set its own limit, depending on local restrictions. For extended interoperability, it is recommended to support at least 8 MB.
-  Frame parameters are extracted from the beginning of the compressed frame.
-  Data fragment must be large enough to ensure successful decoding, typically `ZSTD_frameHeaderSize_max` bytes.
-  @result : 0 : successful decoding, the `ZSTD_frameParams` structure is correctly filled.
+  Each application can set its own limit, depending on local restrictions.
+  For extended interoperability, it is recommended to support windowSize of at least 8 MB.
+  Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
+  Data fragment must be large enough to ensure successful decoding.
+  `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
+  @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
            >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
            errorCode, which can be tested using ZSTD_isError().
 
@@ -731,7 +760,7 @@
   It also returns Frame Size as fparamsPtr->frameContentSize.
 <BR></pre>
 
-<h3>Buffer-less streaming decompression functions</h3><pre></pre><b><pre>size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   </b>/**< doesn't consume input, see details below */<b>
+<h3>Buffer-less streaming decompression functions</h3><pre></pre><b><pre>size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   </b>/**< doesn't consume input */<b>
 size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
 size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
 size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
@@ -778,26 +807,17 @@
                               * The higher the value of selected strategy, the more complex it is,
                               * resulting in stronger and slower compression.
                               * Special: value 0 means "do not change strategy". */
-#if 0
-    ZSTD_p_windowSize,       </b>/* Maximum allowed back-reference distance.<b>
-                              * Can be set to a more precise value than windowLog.
-                              * Will be transparently reduced to closest possible inferior value
-                              * (see Zstandard compression format) */
-                             </b>/* Not ready yet ! */<b>
-#endif
 
     </b>/* frame parameters */<b>
     ZSTD_p_contentSizeFlag=200, </b>/* Content size is written into frame header _whenever known_ (default:1) */<b>
     ZSTD_p_checksumFlag,     </b>/* A 32-bits checksum of content is written at end of frame (default:0) */<b>
     ZSTD_p_dictIDFlag,       </b>/* When applicable, dictID of dictionary is provided in frame header (default:1) */<b>
 
-    </b>/* dictionary parameters */<b>
-    ZSTD_p_refDictContent=300, </b>/* Content of dictionary content will be referenced, instead of copied (default:0).<b>
-                              * This avoids duplicating dictionary content.
-                              * But it also requires that dictionary buffer outlives its users */
-                             </b>/* Not ready yet ! <=================================== */<b>
-    ZSTD_p_dictMode,         </b>/* Select how dictionary must be interpreted. Value must be from type ZSTD_dictMode_e.<b>
+    </b>/* dictionary parameters (must be set before ZSTD_CCtx_loadDictionary) */<b>
+    ZSTD_p_dictMode=300,     </b>/* Select how dictionary content must be interpreted. Value must be from type ZSTD_dictMode_e.<b>
                               * default : 0==auto : dictionary will be "full" if it respects specification, otherwise it will be "rawContent" */
+    ZSTD_p_refDictContent,   </b>/* Dictionary content will be referenced, instead of copied (default:0==byCopy).<b>
+                              * It requires that dictionary buffer outlives its users */
 
     </b>/* multi-threading parameters */<b>
     ZSTD_p_nbThreads=400,    </b>/* Select how many threads a compression job can spawn (default:1)<b>
@@ -812,9 +832,9 @@
                               * 0 => no overlap, 6(default) => use 1/8th of windowSize, >=9 => use full windowSize */
 
     </b>/* advanced parameters - may not remain available after API update */<b>
-    ZSTD_p_forceMaxWindow=1100, </b>/* Force back-references to remain < windowSize,<b>
-                              * even when referencing into Dictionary content.
-                              * default : 0 when using a CDict, 1 when using a Prefix */
+    ZSTD_p_forceMaxWindow=1100, </b>/* Force back-reference distances to remain < windowSize,<b>
+                              * even when referencing into Dictionary content (default:0) */
+
 } ZSTD_cParameter;
 </b></pre><BR>
 <pre><b>size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value);
@@ -840,8 +860,8 @@
  @result : 0, or an error code (which can be tested with ZSTD_isError()).
   Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
             meaning "return to no-dictionary mode".
-  Note 1 : Dictionary content will be copied internally,
-           except if ZSTD_p_refDictContent is set.
+  Note 1 : `dict` content will be copied internally,
+           except if ZSTD_p_refDictContent is set before loading.
   Note 2 : Loading a dictionary involves building tables, which are dependent on compression parameters.
            For this reason, compression parameters cannot be changed anymore after loading a dictionary.
            It's also a CPU-heavy operation, with non-negligible impact on latency.
@@ -850,7 +870,7 @@
 </p></pre><BR>
 
 <pre><b>size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
-</b><p>  Ref a prepared dictionary, to be used for all next compression jobs.
+</b><p>  Reference a prepared dictionary, to be used for all next compression jobs.
   Note that compression parameters are enforced from within CDict,
  and supersede any compression parameter previously set within CCtx.
   The dictionary will remain valid for future compression jobs using same CCtx.
@@ -862,16 +882,18 @@
  
 </p></pre><BR>
 
-<pre><b>size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize);   </b>/* Not ready yet ! <===================================== */<b>
-</b><p>  Reference a prefix (content-only dictionary) to bootstrap next compression job.
-  Decompression will have to use same prefix.
-  Prefix is only used once. Tables are discarded at end of compression job.
-  If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_CDict.
+<pre><b>size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize);
+</b><p>  Reference a prefix (single-usage dictionary) for next compression job.
+  Decompression needs the same prefix to properly regenerate data.
+  Prefix is **only used once**. Tables are discarded at end of compression job.
+  Subsequent compression jobs will be done without prefix (if none is explicitly referenced).
+  If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_CDict instead.
  @result : 0, or an error code (which can be tested with ZSTD_isError()).
-  Special : Adding a NULL (or 0-size) dictionary invalidates any previous prefix, meaning "return to no-dictionary mode".
+  Special : Adding any prefix (including NULL) invalidates any previous prefix or dictionary
   Note 1 : Prefix buffer is referenced. It must outlive compression job.
   Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters.
-           It's a CPU-heavy operation, with non-negligible impact on latency. 
+           It's a CPU-heavy operation, with non-negligible impact on latency.
+  Note 3 : it's possible to alter ZSTD_p_dictMode using ZSTD_CCtx_setParameter() 
 </p></pre><BR>
 
 <pre><b>typedef enum {
diff --git a/lib/common/huf.h b/lib/common/huf.h
index 7873ca3..dabd359 100644
--- a/lib/common/huf.h
+++ b/lib/common/huf.h
@@ -111,6 +111,18 @@
 #define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
 HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
 
+/**
+ *  The minimum workspace size for the `workSpace` used in
+ *  HUF_readDTableX2_wksp() and HUF_readDTableX4_wksp().
+ *
+ *  The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when
+ *  HUF_TABLELOG_MAX=12 to ~1850 bytes when HUF_TABLELOG_MAX=15.
+ *  Buffer overflow errors may potentially occur if code modifications result in
+ *  a required workspace size greater than that specified in the following
+ *  macro.
+ */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10)
+#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
 
 
 /* ******************************************************************
@@ -170,8 +182,11 @@
 
 size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< decodes RLE and uncompressed */
 size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */
 size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
 size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
 
 
 /* ****************************************
@@ -243,7 +258,9 @@
 U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
 
 size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
 size_t HUF_readDTableX4 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX4_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
 
 size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
 size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
@@ -266,8 +283,11 @@
 size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
 
 size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
 size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
 size_t HUF_decompress1X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
 
 size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /**< automatic selection of single or double symbol decoder, based on DTable */
 size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index 4e216d7..b2c7cfc 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -71,12 +71,17 @@
 
 #if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2)
 #  include <stdio.h>
-   static unsigned g_debugLevel = ZSTD_DEBUG;
-#  define DEBUGLOG(l, ...) {                          \
-                if (l<=g_debugLevel) {                \
-                    fprintf(stderr, __FILE__ ": ");   \
-                    fprintf(stderr, __VA_ARGS__);     \
-                    fprintf(stderr, " \n");           \
+/* recommended values for ZSTD_DEBUG display levels :
+ * 2 : reserved for currently active debugging path
+ * 3 : events once per object lifetime (CCtx, CDict)
+ * 4 : events once per frame
+ * 5 : events once per block
+ * 6 : events once per sequence (*very* verbose) */
+#  define DEBUGLOG(l, ...) {                         \
+                if (l<=ZSTD_DEBUG) {                 \
+                    fprintf(stderr, __FILE__ ": ");  \
+                    fprintf(stderr, __VA_ARGS__);    \
+                    fprintf(stderr, " \n");          \
             }   }
 #else
 #  define DEBUGLOG(l, ...)      {}    /* disabled */
@@ -98,7 +103,6 @@
 *  Common constants
 ***************************************/
 #define ZSTD_OPT_NUM    (1<<12)
-#define ZSTD_DICT_MAGIC  0xEC30A437   /* v0.7+ */
 
 #define ZSTD_REP_NUM      3                 /* number of repcodes */
 #define ZSTD_REP_CHECK    (ZSTD_REP_NUM)    /* number of repcodes to check by the optimal parser */
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index e534960..f492d92 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -88,7 +88,6 @@
     U32   hashLog3;         /* dispatch table : larger == faster, more memory */
     U32   loadedDictEnd;    /* index of end of dictionary */
     U32   forceWindow;      /* force back-references to respect limit of 1<<wLog, even for dictionary */
-    ZSTD_dictMode_e dictMode; /* select restricting dictionary to "rawContent" or "fullDict" only */
     ZSTD_compressionStage_e stage;
     U32   rep[ZSTD_REP_NUM];
     U32   repToConfirm[ZSTD_REP_NUM];
@@ -118,8 +117,6 @@
     unsigned* entropyScratchSpace;
 
     /* streaming */
-    ZSTD_CDict* cdictLocal;
-    const ZSTD_CDict* cdict;
     char*  inBuff;
     size_t inBuffSize;
     size_t inToCompress;
@@ -132,6 +129,14 @@
     ZSTD_cStreamStage streamStage;
     U32    frameEnded;
 
+    /* Dictionary */
+    ZSTD_dictMode_e dictMode; /* select restricting dictionary to "rawContent" or "fullDict" only */
+    U32 dictContentByRef;
+    ZSTD_CDict* cdictLocal;
+    const ZSTD_CDict* cdict;
+    const void* prefix;
+    size_t prefixSize;
+
     /* Multi-threading */
     U32 nbThreads;
     ZSTDMT_CCtx* mtctx;
@@ -322,11 +327,6 @@
         cctx->requestedParams.cParams.strategy = (ZSTD_strategy)value;
         return 0;
 
-#if 0
-    case ZSTD_p_windowSize :   /* to be done later */
-        return ERROR(compressionParameter_unsupported);
-#endif
-
     case ZSTD_p_contentSizeFlag :
         DEBUGLOG(5, "set content size flag = %u", (value>0));
         /* Content size written in frame header _when known_ (default:1) */
@@ -343,10 +343,9 @@
         cctx->requestedParams.fParams.noDictIDFlag = (value==0);
         return 0;
 
-    case ZSTD_p_refDictContent :   /* to be done later */
-        return ERROR(compressionParameter_unsupported);
-
+    /* Dictionary parameters */
     case ZSTD_p_dictMode :
+        if (cctx->cdict) return ERROR(stage_wrong);  /* must be set before loading */
         /* restrict dictionary mode, to "rawContent" or "fullDict" only */
         ZSTD_STATIC_ASSERT((U32)ZSTD_dm_fullDict > (U32)ZSTD_dm_rawContent);
         if (value > (unsigned)ZSTD_dm_fullDict)
@@ -354,6 +353,12 @@
         cctx->dictMode = (ZSTD_dictMode_e)value;
         return 0;
 
+    case ZSTD_p_refDictContent :
+        if (cctx->cdict) return ERROR(stage_wrong);  /* must be set before loading */
+        /* dictionary content will be referenced, instead of copied */
+        cctx->dictContentByRef = value>0;
+        return 0;
+
     case ZSTD_p_forceMaxWindow :  /* Force back-references to remain < windowSize,
                                    * even when referencing into Dictionary content
                                    * default : 0 when using a CDict, 1 when using a Prefix */
@@ -417,7 +422,7 @@
                 ZSTD_getCParams(cctx->compressionLevel, 0, dictSize);
         cctx->cdictLocal = ZSTD_createCDict_advanced(
                                 dict, dictSize,
-                                0 /* byReference */, cctx->dictMode,
+                                cctx->dictContentByRef, cctx->dictMode,
                                 cParams, cctx->customMem);
         cctx->cdict = cctx->cdictLocal;
         if (cctx->cdictLocal == NULL)
@@ -426,19 +431,22 @@
     return 0;
 }
 
-/* Not ready yet ! */
-size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize)
-{
-    (void)cctx; (void)prefix; (void)prefixSize; /* to be done later */
-    if (cctx->streamStage != zcss_init) return ERROR(stage_wrong);
-    return ERROR(compressionParameter_unsupported);
-}
-
 size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
 {
     if (cctx->streamStage != zcss_init) return ERROR(stage_wrong);
     cctx->cdict = cdict;
-    return ERROR(compressionParameter_unsupported);
+    cctx->prefix = NULL;   /* exclusive */
+    cctx->prefixSize = 0;
+    return 0;
+}
+
+size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize)
+{
+    if (cctx->streamStage != zcss_init) return ERROR(stage_wrong);
+    cctx->cdict = NULL;   /* prefix discards any prior cdict */
+    cctx->prefix = prefix;
+    cctx->prefixSize = prefixSize;
+    return 0;
 }
 
 static void ZSTD_startNewCompression(ZSTD_CCtx* cctx)
@@ -532,7 +540,7 @@
 }
 
 
-size_t ZSTD_estimateCCtxSize(ZSTD_compressionParameters cParams)
+size_t ZSTD_estimateCCtxSize_advanced(ZSTD_compressionParameters cParams)
 {
     size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
     U32    const divider = (cParams.searchLength==3) ? 3 : 4;
@@ -558,9 +566,15 @@
     return sizeof(ZSTD_CCtx) + neededSpace;
 }
 
-size_t ZSTD_estimateCStreamSize(ZSTD_compressionParameters cParams)
+size_t ZSTD_estimateCCtxSize(int compressionLevel)
 {
-    size_t const CCtxSize = ZSTD_estimateCCtxSize(cParams);
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, 0);
+    return ZSTD_estimateCCtxSize_advanced(cParams);
+}
+
+size_t ZSTD_estimateCStreamSize_advanced(ZSTD_compressionParameters cParams)
+{
+    size_t const CCtxSize = ZSTD_estimateCCtxSize_advanced(cParams);
     size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
     size_t const inBuffSize = ((size_t)1 << cParams.windowLog) + blockSize;
     size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1;
@@ -569,6 +583,11 @@
     return CCtxSize + streamingSize;
 }
 
+size_t ZSTD_estimateCStreamSize(int compressionLevel) {
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, 0);
+    return ZSTD_estimateCStreamSize_advanced(cParams);
+}
+
 
 static U32 ZSTD_equivalentParams(ZSTD_compressionParameters cParams1,
                                  ZSTD_compressionParameters cParams2)
@@ -3169,7 +3188,7 @@
     if (dictMode==ZSTD_dm_rawContent)
         return ZSTD_loadDictionaryContent(cctx, dict, dictSize);
 
-    if (MEM_readLE32(dict) != ZSTD_DICT_MAGIC) {
+    if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
         if (dictMode == ZSTD_dm_auto) {
             DEBUGLOG(5, "raw content dictionary detected");
             return ZSTD_loadDictionaryContent(cctx, dict, dictSize);
@@ -3350,16 +3369,22 @@
 
 /* =====  Dictionary API  ===== */
 
-/*! ZSTD_estimateCDictSize() :
+/*! ZSTD_estimateCDictSize_advanced() :
  *  Estimate amount of memory that will be needed to create a dictionary with following arguments */
-size_t ZSTD_estimateCDictSize(ZSTD_compressionParameters cParams, size_t dictSize, unsigned byReference)
+size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, unsigned byReference)
 {
     DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (U32)sizeof(ZSTD_CDict));
-    DEBUGLOG(5, "CCtx estimate : %u", (U32)ZSTD_estimateCCtxSize(cParams));
-    return sizeof(ZSTD_CDict) + ZSTD_estimateCCtxSize(cParams)
+    DEBUGLOG(5, "CCtx estimate : %u", (U32)ZSTD_estimateCCtxSize_advanced(cParams));
+    return sizeof(ZSTD_CDict) + ZSTD_estimateCCtxSize_advanced(cParams)
            + (byReference ? 0 : dictSize);
 }
 
+size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel)
+{
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, dictSize);
+    return ZSTD_estimateCDictSize_advanced(dictSize, cParams, 0);
+}
+
 size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict)
 {
     if (cdict==NULL) return 0;   /* support sizeof on NULL */
@@ -3482,7 +3507,7 @@
                                  unsigned byReference, ZSTD_dictMode_e dictMode,
                                  ZSTD_compressionParameters cParams)
 {
-    size_t const cctxSize = ZSTD_estimateCCtxSize(cParams);
+    size_t const cctxSize = ZSTD_estimateCCtxSize_advanced(cParams);
     size_t const neededSize = sizeof(ZSTD_CDict) + (byReference ? 0 : dictSize)
                             + cctxSize;
     ZSTD_CDict* const cdict = (ZSTD_CDict*) workspace;
@@ -3577,6 +3602,11 @@
     return ZSTD_createCStream_advanced(ZSTD_defaultCMem);
 }
 
+ZSTD_CStream* ZSTD_initStaticCStream(void *workspace, size_t workspaceSize)
+{
+    return ZSTD_initStaticCCtx(workspace, workspaceSize);
+}
+
 ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem)
 {   /* CStream and CCtx are now same object */
     return ZSTD_createCCtx_advanced(customMem);
@@ -3599,14 +3629,18 @@
 }
 
 static size_t ZSTD_resetCStream_internal(ZSTD_CStream* zcs,
-                                        ZSTD_parameters params,
-                                        unsigned long long pledgedSrcSize)
+                    const void* dict, size_t dictSize, ZSTD_dictMode_e dictMode,
+                    const ZSTD_CDict* cdict,
+                    ZSTD_parameters params, unsigned long long pledgedSrcSize)
 {
     DEBUGLOG(5, "ZSTD_resetCStream_internal");
+    /* params are supposed to be fully validated at this point */
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
 
     CHECK_F( ZSTD_compressBegin_internal(zcs,
-                                        NULL, 0, ZSTD_dm_auto,
-                                        zcs->cdict,
+                                        dict, dictSize, dictMode,
+                                        cdict,
                                         params, pledgedSrcSize,
                                         ZSTDb_buffered) );
 
@@ -3627,10 +3661,11 @@
     if (zcs->compressionLevel != ZSTD_CLEVEL_CUSTOM) {
         params.cParams = ZSTD_getCParams(zcs->compressionLevel, pledgedSrcSize, 0 /* dictSize */);
     }
-    return ZSTD_resetCStream_internal(zcs, params, pledgedSrcSize);
+    return ZSTD_resetCStream_internal(zcs, NULL, 0, zcs->dictMode, zcs->cdict, params, pledgedSrcSize);
 }
 
 /*! ZSTD_initCStream_internal() :
+ *  Note : not static, but hidden (not exposed). Used by zstdmt_compress.c
  *  Assumption 1 : params are valid
  *  Assumption 2 : either dict, or cdict, is defined, not both */
 size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
@@ -3649,7 +3684,7 @@
         }
         ZSTD_freeCDict(zcs->cdictLocal);
         zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize,
-                                            0 /* byReference */, ZSTD_dm_auto,
+                                            zcs->dictContentByRef, zcs->dictMode,
                                             params.cParams, zcs->customMem);
         zcs->cdict = zcs->cdictLocal;
         if (zcs->cdictLocal == NULL) return ERROR(memory_allocation);
@@ -3665,7 +3700,7 @@
 
     zcs->requestedParams = params;
     zcs->compressionLevel = ZSTD_CLEVEL_CUSTOM;
-    return ZSTD_resetCStream_internal(zcs, params, pledgedSrcSize);
+    return ZSTD_resetCStream_internal(zcs, NULL, 0, zcs->dictMode, zcs->cdict, params, pledgedSrcSize);
 }
 
 /* ZSTD_initCStream_usingCDict_advanced() :
@@ -3889,26 +3924,30 @@
 
     if (cctx->streamStage == zcss_init) {
         /* transparent reset */
+        const void* const prefix = cctx->prefix;
+        size_t const prefixSize = cctx->prefixSize;
         ZSTD_parameters params = cctx->requestedParams;
         if (cctx->compressionLevel != ZSTD_CLEVEL_CUSTOM)
             params.cParams = ZSTD_getCParams(cctx->compressionLevel,
-                                    cctx->pledgedSrcSizePlusOne-1, 0 /* dictSize */);
+                                cctx->pledgedSrcSizePlusOne-1, 0 /*dictSize*/);
+        cctx->prefix = NULL; cctx->prefixSize = 0;   /* single usage */
+        assert(prefix==NULL || cctx->cdict==NULL);   /* only one can be set */
 
 #ifdef ZSTD_MULTITHREAD
         if (cctx->nbThreads > 1) {
-            DEBUGLOG(4, "call ZSTDMT_initCStream_internal");
-            CHECK_F( ZSTDMT_initCStream_internal(cctx->mtctx, NULL, 0, cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) );
+            DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbThreads=%u", cctx->nbThreads);
+            CHECK_F( ZSTDMT_initCStream_internal(cctx->mtctx, prefix, prefixSize, cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) );
             cctx->streamStage = zcss_load;
         } else
 #endif
         {
-            CHECK_F( ZSTD_resetCStream_internal(cctx, params, cctx->pledgedSrcSizePlusOne-1) );
+            CHECK_F( ZSTD_resetCStream_internal(cctx, prefix, prefixSize, cctx->dictMode, cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) );
     }   }
 
 #ifdef ZSTD_MULTITHREAD
     if (cctx->nbThreads > 1) {
         size_t const flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp);
-        DEBUGLOG(4, "ZSTDMT_compressStream_generic : %u", (U32)flushMin);
+        DEBUGLOG(5, "ZSTDMT_compressStream_generic : %u", (U32)flushMin);
         if ( ZSTD_isError(flushMin)
           || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */
             ZSTD_startNewCompression(cctx);
@@ -3970,30 +4009,30 @@
 int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
 
 static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {
-{   /* "default" */
+{   /* "default" - guarantees a monotonically increasing memory budget */
     /* W,  C,  H,  S,  L, TL, strat */
     { 18, 12, 12,  1,  7, 16, ZSTD_fast    },  /* level  0 - never used */
     { 19, 13, 14,  1,  7, 16, ZSTD_fast    },  /* level  1 */
     { 19, 15, 16,  1,  6, 16, ZSTD_fast    },  /* level  2 */
-    { 20, 16, 17,  1,  5, 16, ZSTD_dfast   },  /* level  3.*/
-    { 20, 18, 18,  1,  5, 16, ZSTD_dfast   },  /* level  4.*/
-    { 20, 15, 18,  3,  5, 16, ZSTD_greedy  },  /* level  5 */
-    { 21, 16, 19,  2,  5, 16, ZSTD_lazy    },  /* level  6 */
-    { 21, 17, 20,  3,  5, 16, ZSTD_lazy    },  /* level  7 */
+    { 20, 16, 17,  1,  5, 16, ZSTD_dfast   },  /* level  3 */
+    { 20, 17, 18,  1,  5, 16, ZSTD_dfast   },  /* level  4 */
+    { 20, 17, 18,  2,  5, 16, ZSTD_greedy  },  /* level  5 */
+    { 21, 17, 19,  2,  5, 16, ZSTD_lazy    },  /* level  6 */
+    { 21, 18, 19,  3,  5, 16, ZSTD_lazy    },  /* level  7 */
     { 21, 18, 20,  3,  5, 16, ZSTD_lazy2   },  /* level  8 */
-    { 21, 20, 20,  3,  5, 16, ZSTD_lazy2   },  /* level  9 */
+    { 21, 19, 20,  3,  5, 16, ZSTD_lazy2   },  /* level  9 */
     { 21, 19, 21,  4,  5, 16, ZSTD_lazy2   },  /* level 10 */
     { 22, 20, 22,  4,  5, 16, ZSTD_lazy2   },  /* level 11 */
     { 22, 20, 22,  5,  5, 16, ZSTD_lazy2   },  /* level 12 */
     { 22, 21, 22,  5,  5, 16, ZSTD_lazy2   },  /* level 13 */
     { 22, 21, 22,  6,  5, 16, ZSTD_lazy2   },  /* level 14 */
-    { 22, 21, 21,  5,  5, 16, ZSTD_btlazy2 },  /* level 15 */
+    { 22, 21, 22,  5,  5, 16, ZSTD_btlazy2 },  /* level 15 */
     { 23, 22, 22,  5,  5, 16, ZSTD_btlazy2 },  /* level 16 */
-    { 23, 21, 22,  4,  5, 24, ZSTD_btopt   },  /* level 17 */
+    { 23, 22, 22,  4,  5, 24, ZSTD_btopt   },  /* level 17 */
     { 23, 22, 22,  5,  4, 32, ZSTD_btopt   },  /* level 18 */
     { 23, 23, 22,  6,  3, 48, ZSTD_btopt   },  /* level 19 */
     { 25, 25, 23,  7,  3, 64, ZSTD_btultra },  /* level 20 */
-    { 26, 26, 23,  7,  3,256, ZSTD_btultra },  /* level 21 */
+    { 26, 26, 24,  7,  3,256, ZSTD_btultra },  /* level 21 */
     { 27, 27, 25,  9,  3,512, ZSTD_btultra },  /* level 22 */
 },
 {   /* for srcSize <= 256 KB */
@@ -4076,6 +4115,25 @@
 },
 };
 
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
+/* This function just controls
+ * the monotonic memory budget increase of ZSTD_defaultCParameters[0].
+ * Run only once, on first ZSTD_getCParams() usage, when ZSTD_DEBUG is enabled
+ */
+MEM_STATIC void ZSTD_check_compressionLevel_monotonicIncrease_memoryBudget(void)
+{
+    int level;
+    for (level=1; level<ZSTD_maxCLevel(); level++) {
+        ZSTD_compressionParameters const c1 = ZSTD_defaultCParameters[0][level];
+        ZSTD_compressionParameters const c2 = ZSTD_defaultCParameters[0][level+1];
+        DEBUGLOG(3, "controlling compression params level %i", level);
+        assert(c1.windowLog <= c2.windowLog);
+#       define ZSTD_TABLECOST(h,c) ((1<<(h)) + (1<<(c)))
+        assert(ZSTD_TABLECOST(c1.hashLog, c1.chainLog) <= ZSTD_TABLECOST(c2.hashLog, c2.chainLog));
+    }
+}
+#endif
+
 /*! ZSTD_getCParams() :
 *   @return ZSTD_compressionParameters structure for a selected compression level, `srcSize` and `dictSize`.
 *   Size values are optional, provide 0 if not known or unused */
@@ -4084,6 +4142,15 @@
     size_t const addedSize = srcSizeHint ? 0 : 500;
     U64 const rSize = srcSizeHint+dictSize ? srcSizeHint+dictSize+addedSize : (U64)-1;
     U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB);   /* intentional underflow for srcSizeHint == 0 */
+
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
+    static int g_monotonicTest = 1;
+    if (g_monotonicTest) {
+        ZSTD_check_compressionLevel_monotonicIncrease_memoryBudget();
+        g_monotonicTest=0;
+    }
+#endif
+
     if (compressionLevel <= 0) compressionLevel = ZSTD_CLEVEL_DEFAULT;   /* 0 == default; no negative compressionLevel yet */
     if (compressionLevel > ZSTD_MAX_CLEVEL) compressionLevel = ZSTD_MAX_CLEVEL;
     { ZSTD_compressionParameters const cp = ZSTD_defaultCParameters[tableID][compressionLevel];
diff --git a/lib/compress/zstdmt_compress.c b/lib/compress/zstdmt_compress.c
index 8a167ce..d5f08c7 100644
--- a/lib/compress/zstdmt_compress.c
+++ b/lib/compress/zstdmt_compress.c
@@ -586,7 +586,7 @@
     if (dict) {
         ZSTD_freeCDict(zcs->cdictLocal);
         zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize,
-                                                    0 /* byRef */, ZSTD_dm_auto,
+                                                    0 /* byRef */, ZSTD_dm_auto,   /* note : a loadPrefix becomes an internal CDict */
                                                     params.cParams, zcs->cMem);
         zcs->cdict = zcs->cdictLocal;
         if (zcs->cdictLocal == NULL) return ERROR(memory_allocation);
diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c
index d40b3be..2a1b70e 100644
--- a/lib/decompress/huf_decompress.c
+++ b/lib/decompress/huf_decompress.c
@@ -67,6 +67,12 @@
 #define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
 
 
+/* **************************************************************
+*  Byte alignment for workSpace management
+****************************************************************/
+#define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1)
+#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
+
 /*-***************************/
 /*  generic DTableDesc       */
 /*-***************************/
@@ -87,16 +93,28 @@
 
 typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */
 
-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize)
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
 {
-    BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
-    U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];   /* large enough for values from 0 to 16 */
     U32 tableLog = 0;
     U32 nbSymbols = 0;
     size_t iSize;
     void* const dtPtr = DTable + 1;
     HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
 
+    U32* rankVal;
+    BYTE* huffWeight;
+    size_t spaceUsed32 = 0;
+
+    rankVal = (U32 *)workSpace + spaceUsed32;
+    spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
+    huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
+    spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+    if ((spaceUsed32 << 2) > wkspSize)
+        return ERROR(tableLog_tooLarge);
+    workSpace = (U32 *)workSpace + spaceUsed32;
+    wkspSize -= (spaceUsed32 << 2);
+
     HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
     /* memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzer complain ... */
 
@@ -135,6 +153,13 @@
     return iSize;
 }
 
+size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_readDTableX2_wksp(DTable, src, srcSize,
+                                 workSpace, sizeof(workSpace));
+}
+
 
 static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
 {
@@ -212,11 +237,13 @@
     return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
 }
 
-size_t HUF_decompress1X2_DCtx (HUF_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX2 (DCtx, cSrc, cSrcSize);
+    size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
@@ -224,6 +251,15 @@
     return HUF_decompress1X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx);
 }
 
+
+size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+
 size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
 {
     HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
@@ -335,11 +371,14 @@
 }
 
 
-size_t HUF_decompress4X2_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX2 (dctx, cSrc, cSrcSize);
+    size_t const hSize = HUF_readDTableX2_wksp (dctx, cSrc, cSrcSize,
+                                                workSpace, wkspSize);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
@@ -347,6 +386,13 @@
     return HUF_decompress4X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, dctx);
 }
 
+
+size_t HUF_decompress4X2_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
 size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
 {
     HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
@@ -403,7 +449,8 @@
     }   }
 }
 
-typedef U32 rankVal_t[HUF_TABLELOG_MAX][HUF_TABLELOG_MAX + 1];
+typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
+typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
 
 static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
                            const sortedSymbol_t* sortedList, const U32 sortedListSize,
@@ -447,20 +494,43 @@
     }
 }
 
-size_t HUF_readDTableX4 (HUF_DTable* DTable, const void* src, size_t srcSize)
+size_t HUF_readDTableX4_wksp(HUF_DTable* DTable, const void* src,
+                             size_t srcSize, void* workSpace,
+                             size_t wkspSize)
 {
-    BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
-    sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
-    U32 rankStats[HUF_TABLELOG_MAX + 1] = { 0 };
-    U32 rankStart0[HUF_TABLELOG_MAX + 2] = { 0 };
-    U32* const rankStart = rankStart0+1;
-    rankVal_t rankVal;
     U32 tableLog, maxW, sizeOfSort, nbSymbols;
     DTableDesc dtd = HUF_getDTableDesc(DTable);
     U32 const maxTableLog = dtd.maxTableLog;
     size_t iSize;
     void* dtPtr = DTable+1;   /* force compiler to avoid strict-aliasing */
     HUF_DEltX4* const dt = (HUF_DEltX4*)dtPtr;
+    U32 *rankStart;
+
+    rankValCol_t* rankVal;
+    U32* rankStats;
+    U32* rankStart0;
+    sortedSymbol_t* sortedSymbol;
+    BYTE* weightList;
+    size_t spaceUsed32 = 0;
+
+    rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32);
+    spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
+    rankStats = (U32 *)workSpace + spaceUsed32;
+    spaceUsed32 += HUF_TABLELOG_MAX + 1;
+    rankStart0 = (U32 *)workSpace + spaceUsed32;
+    spaceUsed32 += HUF_TABLELOG_MAX + 2;
+    sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t);
+    spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
+    weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
+    spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+    if ((spaceUsed32 << 2) > wkspSize)
+        return ERROR(tableLog_tooLarge);
+    workSpace = (U32 *)workSpace + spaceUsed32;
+    wkspSize -= (spaceUsed32 << 2);
+
+    rankStart = rankStart0 + 1;
+    memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
 
     HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable));   /* if compiler fails here, assertion is wrong */
     if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
@@ -527,6 +597,12 @@
     return iSize;
 }
 
+size_t HUF_readDTableX4(HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+  U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+  return HUF_readDTableX4_wksp(DTable, src, srcSize,
+                               workSpace, sizeof(workSpace));
+}
 
 static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
 {
@@ -627,11 +703,14 @@
     return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
 }
 
-size_t HUF_decompress1X4_DCtx (HUF_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX4 (DCtx, cSrc, cSrcSize);
+    size_t const hSize = HUF_readDTableX4_wksp(DCtx, cSrc, cSrcSize,
+                                               workSpace, wkspSize);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
@@ -639,6 +718,15 @@
     return HUF_decompress1X4_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx);
 }
 
+
+size_t HUF_decompress1X4_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X4_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+
 size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
 {
     HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_TABLELOG_MAX);
@@ -749,11 +837,14 @@
 }
 
 
-size_t HUF_decompress4X4_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t hSize = HUF_readDTableX4 (dctx, cSrc, cSrcSize);
+    size_t hSize = HUF_readDTableX4_wksp(dctx, cSrc, cSrcSize,
+                                         workSpace, wkspSize);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
@@ -761,6 +852,15 @@
     return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx);
 }
 
+
+size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+
 size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
 {
     HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_TABLELOG_MAX);
@@ -862,19 +962,32 @@
     }
 }
 
-size_t HUF_decompress4X_hufOnly (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+                                         workSpace, sizeof(workSpace));
+}
+
+
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
+                                     size_t dstSize, const void* cSrc,
+                                     size_t cSrcSize, void* workSpace,
+                                     size_t wkspSize)
 {
     /* validation checks */
     if (dstSize == 0) return ERROR(dstSize_tooSmall);
     if ((cSrcSize >= dstSize) || (cSrcSize <= 1)) return ERROR(corruption_detected);   /* invalid */
 
     {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-        return algoNb ? HUF_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
-                        HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+        return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize):
+                        HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
     }
 }
 
-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                  const void* cSrc, size_t cSrcSize,
+                                  void* workSpace, size_t wkspSize)
 {
     /* validation checks */
     if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -883,7 +996,17 @@
     if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
 
     {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-        return algoNb ? HUF_decompress1X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
-                        HUF_decompress1X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+        return algoNb ? HUF_decompress1X4_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                                cSrcSize, workSpace, wkspSize):
+                        HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                                cSrcSize, workSpace, wkspSize);
     }
 }
+
+size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+                             const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+                                      workSpace, sizeof(workSpace));
+}
diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c
index 95d18d4..ff79196 100644
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@@ -93,6 +93,7 @@
     FSE_DTable OFTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
     FSE_DTable MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
     HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
+    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
     U32 rep[ZSTD_REP_NUM];
 } ZSTD_entropyTables_t;
 
@@ -559,8 +560,10 @@
                                         HUF_decompress1X_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr) :
                                         HUF_decompress4X_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr) ) :
                                     ( singleStream ?
-                                        HUF_decompress1X2_DCtx(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize) :
-                                        HUF_decompress4X_hufOnly (dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize)) ))
+                                        HUF_decompress1X2_DCtx_wksp(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
+                                                                    dctx->entropy.workspace, sizeof(dctx->entropy.workspace)) :
+                                        HUF_decompress4X_hufOnly_wksp(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
+                                                                      dctx->entropy.workspace, sizeof(dctx->entropy.workspace)))))
                     return ERROR(corruption_detected);
 
                 dctx->litPtr = dctx->litBuffer;
@@ -750,6 +753,7 @@
     const BYTE* const istart = (const BYTE* const)src;
     const BYTE* const iend = istart + srcSize;
     const BYTE* ip = istart;
+    DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
 
     /* check */
     if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
@@ -930,12 +934,18 @@
         seq.offset = offset;
     }
 
-    seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0);  /* <=  16 bits */
+    seq.matchLength = ML_base[mlCode]
+                    + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0);  /* <=  16 bits */
     if (MEM_32bits() && (mlBits+llBits>24)) BIT_reloadDStream(&seqState->DStream);
 
-    seq.litLength = LL_base[llCode] + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0);    /* <=  16 bits */
-    if (MEM_32bits() ||
-       (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream);
+    seq.litLength = LL_base[llCode]
+                  + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0);    /* <=  16 bits */
+    if (  MEM_32bits()
+      || (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) )
+       BIT_reloadDStream(&seqState->DStream);
+
+    DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
 
     /* ANS state update */
     FSE_updateState(&seqState->stateLL, &seqState->DStream);    /* <=  9 bits */
@@ -975,7 +985,8 @@
     /* copy Match */
     if (sequence.offset > (size_t)(oLitEnd - base)) {
         /* offset beyond prefix -> go into extDict */
-        if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
+        if (sequence.offset > (size_t)(oLitEnd - vBase))
+            return ERROR(corruption_detected);
         match = dictEnd + (match - base);
         if (match + sequence.matchLength <= dictEnd) {
             memmove(oLitEnd, match, sequence.matchLength);
@@ -1043,9 +1054,12 @@
     const BYTE* const vBase = (const BYTE*) (dctx->vBase);
     const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
     int nbSeq;
+    DEBUGLOG(5, "ZSTD_decompressSequences");
 
     /* Build Decoding Tables */
     {   size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize);
+        DEBUGLOG(5, "ZSTD_decodeSeqHeaders: size=%u, nbSeq=%i",
+                    (U32)seqHSize, nbSeq);
         if (ZSTD_isError(seqHSize)) return seqHSize;
         ip += seqHSize;
     }
@@ -1064,11 +1078,13 @@
             nbSeq--;
             {   seq_t const sequence = ZSTD_decodeSequence(&seqState);
                 size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                 if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
                 op += oneSeqSize;
         }   }
 
         /* check if reached exact end */
+        DEBUGLOG(5, "after decode loop, remaining nbSeq : %i", nbSeq);
         if (nbSeq) return ERROR(corruption_detected);
         /* save reps for next block */
         { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
@@ -1282,7 +1298,7 @@
     const BYTE* const base = (const BYTE*) (dctx->base);
     const BYTE* const vBase = (const BYTE*) (dctx->vBase);
     const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
-    unsigned const windowSize = dctx->fParams.windowSize;
+    unsigned const windowSize32 = (unsigned)dctx->fParams.windowSize;
     int nbSeq;
 
     /* Build Decoding Tables */
@@ -1312,13 +1328,13 @@
 
         /* prepare in advance */
         for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNb<seqAdvance; seqNb++) {
-            sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, windowSize);
+            sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, windowSize32);
         }
         if (seqNb<seqAdvance) return ERROR(corruption_detected);
 
         /* decode and decompress */
         for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && seqNb<nbSeq ; seqNb++) {
-            seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, windowSize);
+            seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, windowSize32);
             size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd);
             if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
             ZSTD_PREFETCH(sequence.match);
@@ -1355,11 +1371,13 @@
                       const void* src, size_t srcSize)
 {   /* blockType == blockCompressed */
     const BYTE* ip = (const BYTE*)src;
+    DEBUGLOG(5, "ZSTD_decompressBlock_internal");
 
     if (srcSize >= ZSTD_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);
 
     /* Decode literals section */
     {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
         if (ZSTD_isError(litCSize)) return litCSize;
         ip += litCSize;
         srcSize -= litCSize;
@@ -1697,6 +1715,7 @@
  *            or an error code, which can be tested using ZSTD_isError() */
 size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
+    DEBUGLOG(5, "ZSTD_decompressContinue");
     /* Sanity check */
     if (srcSize != dctx->expected) return ERROR(srcSize_wrong);   /* unauthorized */
     if (dstCapacity) ZSTD_checkContinuity(dctx, dst);
@@ -1756,10 +1775,12 @@
         }
     case ZSTDds_decompressLastBlock:
     case ZSTDds_decompressBlock:
+        DEBUGLOG(5, "case ZSTDds_decompressBlock");
         {   size_t rSize;
             switch(dctx->bType)
             {
             case bt_compressed:
+                DEBUGLOG(5, "case bt_compressed");
                 rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
                 break;
             case bt_raw :
@@ -1836,7 +1857,9 @@
     dictPtr += 8;   /* skip header = magic + dictID */
 
 
-    {   size_t const hSize = HUF_readDTableX4(entropy->hufTable, dictPtr, dictEnd-dictPtr);
+    {   size_t const hSize = HUF_readDTableX4_wksp(
+            entropy->hufTable, dictPtr, dictEnd - dictPtr,
+            entropy->workspace, sizeof(entropy->workspace));
         if (HUF_isError(hSize)) return ERROR(dictionary_corrupted);
         dictPtr += hSize;
     }
@@ -1884,7 +1907,7 @@
 {
     if (dictSize < 8) return ZSTD_refDictContent(dctx, dict, dictSize);
     {   U32 const magic = MEM_readLE32(dict);
-        if (magic != ZSTD_DICT_MAGIC) {
+        if (magic != ZSTD_MAGIC_DICTIONARY) {
             return ZSTD_refDictContent(dctx, dict, dictSize);   /* pure content mode */
     }   }
     dctx->dictID = MEM_readLE32((const char*)dict + 4);
@@ -1964,7 +1987,7 @@
     ddict->entropyPresent = 0;
     if (ddict->dictSize < 8) return 0;
     {   U32 const magic = MEM_readLE32(ddict->dictContent);
-        if (magic != ZSTD_DICT_MAGIC) return 0;   /* pure content mode */
+        if (magic != ZSTD_MAGIC_DICTIONARY) return 0;   /* pure content mode */
     }
     ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + 4);
 
@@ -2083,7 +2106,7 @@
 unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
 {
     if (dictSize < 8) return 0;
-    if (MEM_readLE32(dict) != ZSTD_DICT_MAGIC) return 0;
+    if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0;
     return MEM_readLE32((const char*)dict + 4);
 }
 
@@ -2143,6 +2166,11 @@
     return ZSTD_createDStream_advanced(ZSTD_defaultCMem);
 }
 
+ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize)
+{
+    return ZSTD_initStaticDCtx(workspace, workspaceSize);
+}
+
 ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem)
 {
     return ZSTD_createDCtx_advanced(customMem);
@@ -2214,15 +2242,23 @@
     return ZSTD_sizeof_DCtx(zds);
 }
 
-size_t ZSTD_estimateDStreamSize(ZSTD_frameHeader fHeader)
+size_t ZSTD_estimateDStreamSize(size_t windowSize)
 {
-    size_t const windowSize = fHeader.windowSize;
     size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
     size_t const inBuffSize = blockSize;  /* no block can be larger */
     size_t const outBuffSize = windowSize + blockSize + (WILDCOPY_OVERLENGTH * 2);
     return sizeof(ZSTD_DStream) + ZSTD_estimateDCtxSize() + inBuffSize + outBuffSize;
 }
 
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize)
+{
+    ZSTD_frameHeader fh;
+    size_t const err = ZSTD_getFrameHeader(&fh, src, srcSize);
+    if (ZSTD_isError(err)) return err;
+    if (err>0) return ERROR(srcSize_wrong);
+    return ZSTD_estimateDStreamSize(fh.windowSize);
+}
+
 
 /* *****   Decompression   ***** */
 
@@ -2310,7 +2346,7 @@
             }   }
 
             /* Consume header (see ZSTDds_decodeFrameHeader) */
-            DEBUGLOG(5, "Consume header");
+            DEBUGLOG(4, "Consume header");
             CHECK_F(ZSTD_decompressBegin_usingDDict(zds, zds->ddict));
 
             if ((MEM_readLE32(zds->headerBuffer) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
@@ -2323,7 +2359,7 @@
             }
 
             /* control buffer memory usage */
-            DEBUGLOG(5, "Control max buffer memory usage");
+            DEBUGLOG(4, "Control max buffer memory usage");
             zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN);
             if (zds->fParams.windowSize > zds->maxWindowSize) return ERROR(frameParameter_windowTooLarge);
 
@@ -2333,12 +2369,12 @@
                 zds->blockSize = blockSize;
                 if ((zds->inBuffSize < blockSize) || (zds->outBuffSize < neededOutSize)) {
                     size_t const bufferSize = blockSize + neededOutSize;
-                    DEBUGLOG(5, "inBuff  : from %u to %u",
+                    DEBUGLOG(4, "inBuff  : from %u to %u",
                                 (U32)zds->inBuffSize, (U32)blockSize);
-                    DEBUGLOG(5, "outBuff : from %u to %u",
+                    DEBUGLOG(4, "outBuff : from %u to %u",
                                 (U32)zds->outBuffSize, (U32)neededOutSize);
                     if (zds->staticSize) {  /* static DCtx */
-                        DEBUGLOG(5, "staticSize : %u", (U32)zds->staticSize);
+                        DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize);
                         assert(zds->staticSize >= sizeof(ZSTD_DCtx));  /* controlled at init */
                         if (bufferSize > zds->staticSize - sizeof(ZSTD_DCtx))
                             return ERROR(memory_allocation);
diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c
index 1863c8f..06c1b9f 100644
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -398,7 +398,8 @@
  */
 static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                            COVER_map_t *activeDmers, U32 begin,
-                                           U32 end, COVER_params_t parameters) {
+                                           U32 end,
+                                           ZDICT_cover_params_t parameters) {
   /* Constants */
   const U32 k = parameters.k;
   const U32 d = parameters.d;
@@ -478,7 +479,7 @@
  * Check the validity of the parameters.
  * Returns non-zero if the parameters are valid and 0 otherwise.
  */
-static int COVER_checkParameters(COVER_params_t parameters) {
+static int COVER_checkParameters(ZDICT_cover_params_t parameters) {
   /* k and d are required parameters */
   if (parameters.d == 0 || parameters.k == 0) {
     return 0;
@@ -600,7 +601,7 @@
 static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                     COVER_map_t *activeDmers, void *dictBuffer,
                                     size_t dictBufferCapacity,
-                                    COVER_params_t parameters) {
+                                    ZDICT_cover_params_t parameters) {
   BYTE *const dict = (BYTE *)dictBuffer;
   size_t tail = dictBufferCapacity;
   /* Divide the data up into epochs of equal size.
@@ -639,22 +640,10 @@
   return tail;
 }
 
-/**
- * Translate from COVER_params_t to ZDICT_params_t required for finalizing the
- * dictionary.
- */
-static ZDICT_params_t COVER_translateParams(COVER_params_t parameters) {
-  ZDICT_params_t zdictParams;
-  memset(&zdictParams, 0, sizeof(zdictParams));
-  zdictParams.notificationLevel = 1;
-  zdictParams.dictID = parameters.dictID;
-  zdictParams.compressionLevel = parameters.compressionLevel;
-  return zdictParams;
-}
-
-ZDICTLIB_API size_t COVER_trainFromBuffer(
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
     void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
-    const size_t *samplesSizes, unsigned nbSamples, COVER_params_t parameters) {
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t parameters) {
   BYTE *const dict = (BYTE *)dictBuffer;
   COVER_ctx_t ctx;
   COVER_map_t activeDmers;
@@ -673,7 +662,7 @@
     return ERROR(dstSize_tooSmall);
   }
   /* Initialize global data */
-  g_displayLevel = parameters.notificationLevel;
+  g_displayLevel = parameters.zParams.notificationLevel;
   /* Initialize context and activeDmers */
   if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                       parameters.d)) {
@@ -690,10 +679,9 @@
     const size_t tail =
         COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
                               dictBufferCapacity, parameters);
-    ZDICT_params_t zdictParams = COVER_translateParams(parameters);
     const size_t dictionarySize = ZDICT_finalizeDictionary(
         dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
-        samplesBuffer, samplesSizes, nbSamples, zdictParams);
+        samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
     if (!ZSTD_isError(dictionarySize)) {
       DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                    (U32)dictionarySize);
@@ -718,7 +706,7 @@
   size_t liveJobs;
   void *dict;
   size_t dictSize;
-  COVER_params_t parameters;
+  ZDICT_cover_params_t parameters;
   size_t compressedSize;
 } COVER_best_t;
 
@@ -786,7 +774,7 @@
  * If this dictionary is the best so far save it and its parameters.
  */
 static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
-                              COVER_params_t parameters, void *dict,
+                              ZDICT_cover_params_t parameters, void *dict,
                               size_t dictSize) {
   if (!best) {
     return;
@@ -830,7 +818,7 @@
   const COVER_ctx_t *ctx;
   COVER_best_t *best;
   size_t dictBufferCapacity;
-  COVER_params_t parameters;
+  ZDICT_cover_params_t parameters;
 } COVER_tryParameters_data_t;
 
 /**
@@ -842,7 +830,7 @@
   /* Save parameters as local variables */
   COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
   const COVER_ctx_t *const ctx = data->ctx;
-  const COVER_params_t parameters = data->parameters;
+  const ZDICT_cover_params_t parameters = data->parameters;
   size_t dictBufferCapacity = data->dictBufferCapacity;
   size_t totalCompressedSize = ERROR(GENERIC);
   /* Allocate space for hash table, dict, and freqs */
@@ -863,10 +851,10 @@
   {
     const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
                                               dictBufferCapacity, parameters);
-    const ZDICT_params_t zdictParams = COVER_translateParams(parameters);
     dictBufferCapacity = ZDICT_finalizeDictionary(
         dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
-        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, zdictParams);
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples,
+        parameters.zParams);
     if (ZDICT_isError(dictBufferCapacity)) {
       DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
       goto _cleanup;
@@ -892,8 +880,8 @@
     }
     /* Create the cctx and cdict */
     cctx = ZSTD_createCCtx();
-    cdict =
-        ZSTD_createCDict(dict, dictBufferCapacity, parameters.compressionLevel);
+    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                             parameters.zParams.compressionLevel);
     if (!dst || !cctx || !cdict) {
       goto _compressCleanup;
     }
@@ -930,12 +918,10 @@
   }
 }
 
-ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
-                                                  size_t dictBufferCapacity,
-                                                  const void *samplesBuffer,
-                                                  const size_t *samplesSizes,
-                                                  unsigned nbSamples,
-                                                  COVER_params_t *parameters) {
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t *parameters) {
   /* constants */
   const unsigned nbThreads = parameters->nbThreads;
   const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
@@ -947,7 +933,7 @@
   const unsigned kIterations =
       (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
   /* Local variables */
-  const int displayLevel = parameters->notificationLevel;
+  const int displayLevel = parameters->zParams.notificationLevel;
   unsigned iteration = 1;
   unsigned d;
   unsigned k;
@@ -976,7 +962,7 @@
   /* Initialization */
   COVER_best_init(&best);
   /* Turn down global display level to clean up display at level 2 and below */
-  g_displayLevel = parameters->notificationLevel - 1;
+  g_displayLevel = parameters->zParams.notificationLevel - 1;
   /* Loop through d first because each new value needs a new context */
   LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
                     kIterations);
diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c
index 943ddde..742586e 100644
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -94,7 +94,7 @@
 unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
 {
     if (dictSize < 8) return 0;
-    if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0;
+    if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
     return MEM_readLE32((const char*)dictBuffer + 4);
 }
 
@@ -487,7 +487,7 @@
 }
 
 
-static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
+static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
                             const void* const buffer, size_t bufferSize,   /* buffer must end with noisy guard band */
                             const size_t* fileSizes, unsigned nbFiles,
                             U32 minRatio, U32 notificationLevel)
@@ -634,17 +634,6 @@
     }   }   }
 }
 
-/*
-static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
-{
-    unsigned u;
-    size_t max=0;
-    for (u=0; u<nbFiles; u++)
-        if (max < fileSizes[u]) max = fileSizes[u];
-    return max;
-}
-*/
-
 static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
 {
     size_t total=0;
@@ -865,7 +854,7 @@
     if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
 
     /* dictionary header */
-    MEM_writeLE32(header, ZSTD_DICT_MAGIC);
+    MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
     {   U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
         U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
         U32 const dictID = params.dictID ? params.dictID : compliantID;
@@ -917,7 +906,7 @@
     }
 
     /* add dictionary header (after entropy tables) */
-    MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
+    MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
     {   U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
         U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
         U32 const dictID = params.dictID ? params.dictID : compliantID;
@@ -930,14 +919,14 @@
 }
 
 
-/*! ZDICT_trainFromBuffer_unsafe() :
+/*! ZDICT_trainFromBuffer_unsafe_legacy() :
 *   Warning : `samplesBuffer` must be followed by noisy guard band.
 *   @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
 */
-size_t ZDICT_trainFromBuffer_unsafe(
+size_t ZDICT_trainFromBuffer_unsafe_legacy(
                             void* dictBuffer, size_t maxDictSize,
                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                            ZDICT_params_t params)
+                            ZDICT_legacy_params_t params)
 {
     U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
     dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
@@ -946,7 +935,7 @@
     size_t const targetDictSize = maxDictSize;
     size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
     size_t dictSize = 0;
-    U32 const notificationLevel = params.notificationLevel;
+    U32 const notificationLevel = params.zParams.notificationLevel;
 
     /* checks */
     if (!dictList) return ERROR(memory_allocation);
@@ -957,13 +946,13 @@
     ZDICT_initDictItem(dictList);
 
     /* build dictionary */
-    ZDICT_trainBuffer(dictList, dictListSize,
-                    samplesBuffer, samplesBuffSize,
-                    samplesSizes, nbSamples,
-                    minRep, notificationLevel);
+    ZDICT_trainBuffer_legacy(dictList, dictListSize,
+                       samplesBuffer, samplesBuffSize,
+                       samplesSizes, nbSamples,
+                       minRep, notificationLevel);
 
     /* display best matches */
-    if (params.notificationLevel>= 3) {
+    if (params.zParams.notificationLevel>= 3) {
         U32 const nb = MIN(25, dictList[0].pos);
         U32 const dictContentSize = ZDICT_dictSize(dictList);
         U32 u;
@@ -1026,7 +1015,7 @@
 
         dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
                                                              samplesBuffer, samplesSizes, nbSamples,
-                                                             params);
+                                                             params.zParams);
     }
 
     /* clean up */
@@ -1037,9 +1026,9 @@
 
 /* issue : samplesBuffer need to be followed by a noisy guard band.
 *  work around : duplicate the buffer, and add the noise */
-size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
-                                      const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                                      ZDICT_params_t params)
+size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
+                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                              ZDICT_legacy_params_t params)
 {
     size_t result;
     void* newBuff;
@@ -1052,10 +1041,9 @@
     memcpy(newBuff, samplesBuffer, sBuffSize);
     ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH);   /* guard band, for end of buffer condition */
 
-    result = ZDICT_trainFromBuffer_unsafe(
-                                        dictBuffer, dictBufferCapacity,
-                                        newBuff, samplesSizes, nbSamples,
-                                        params);
+    result =
+        ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
+                                            samplesSizes, nbSamples, params);
     free(newBuff);
     return result;
 }
@@ -1064,11 +1052,13 @@
 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
 {
-    ZDICT_params_t params;
+    ZDICT_cover_params_t params;
     memset(&params, 0, sizeof(params));
-    return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
-                                          samplesBuffer, samplesSizes, nbSamples,
-                                          params);
+    params.d = 8;
+    params.steps = 4;
+    return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
+                                               samplesBuffer, samplesSizes,
+                                               nbSamples, &params);
 }
 
 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h
index 5ef2a3f..7bfbb35 100644
--- a/lib/dictBuilder/zdict.h
+++ b/lib/dictBuilder/zdict.h
@@ -36,18 +36,20 @@
 #endif
 
 
-/*! ZDICT_trainFromBuffer() :
-    Train a dictionary from an array of samples.
-    Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
-    supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
-    The resulting dictionary will be saved into `dictBuffer`.
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
-              or an error code, which can be tested with ZDICT_isError().
-    Tips : In general, a reasonable dictionary has a size of ~ 100 KB.
-           It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
-           In general, it's recommended to provide a few thousands samples, but this can vary a lot.
-           It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
-*/
+/*! ZDICT_trainFromBuffer():
+ * Train a dictionary from an array of samples.
+ * Uses ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ * The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ * Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte.
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, but this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ */
 ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
                        const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
 
@@ -69,94 +71,78 @@
  * ==================================================================================== */
 
 typedef struct {
-    unsigned selectivityLevel;   /* 0 means default; larger => select more => larger dictionary */
     int      compressionLevel;   /* 0 means default; target a specific zstd compression level */
     unsigned notificationLevel;  /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
     unsigned dictID;             /* 0 means auto mode (32-bits random value); other : force dictID value */
-    unsigned reserved[2];        /* reserved space for future parameters */
 } ZDICT_params_t;
 
-
-/*! ZDICT_trainFromBuffer_advanced() :
-    Same as ZDICT_trainFromBuffer() with control over more parameters.
-    `parameters` is optional and can be provided with values set to 0 to mean "default".
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferSize`),
-              or an error code, which can be tested by ZDICT_isError().
-    note : ZDICT_trainFromBuffer_advanced() will send notifications into stderr if instructed to, using notificationLevel>0.
-*/
-ZDICTLIB_API size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
-                                const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                                ZDICT_params_t parameters);
-
-/*! COVER_params_t :
-    For all values 0 means default.
-    k and d are the only required parameters.
-*/
+/*! ZDICT_cover_params_t:
+ *  For all values 0 means default.
+ *  k and d are the only required parameters.
+ */
 typedef struct {
     unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
     unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
     unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
-
     unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
-    unsigned notificationLevel;  /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
-    unsigned dictID;             /* 0 means auto mode (32-bits random value); other : force dictID value */
-    int      compressionLevel;   /* 0 means default; target a specific zstd compression level */
-} COVER_params_t;
+    ZDICT_params_t zParams;
+} ZDICT_cover_params_t;
 
 
-/*! COVER_trainFromBuffer() :
-    Train a dictionary from an array of samples using the COVER algorithm.
-    Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
-    supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
-    The resulting dictionary will be saved into `dictBuffer`.
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
-              or an error code, which can be tested with ZDICT_isError().
-    Note : COVER_trainFromBuffer() requires about 9 bytes of memory for each input byte.
-    Tips : In general, a reasonable dictionary has a size of ~ 100 KB.
-           It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
-           In general, it's recommended to provide a few thousands samples, but this can vary a lot.
-           It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
-*/
-ZDICTLIB_API size_t COVER_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
-                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                              COVER_params_t parameters);
+/*! ZDICT_trainFromBuffer_cover():
+ * Train a dictionary from an array of samples using the COVER algorithm.
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ * The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, but this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t parameters);
 
-/*! COVER_optimizeTrainFromBuffer() :
-    The same requirements as above hold for all the parameters except `parameters`.
-    This function tries many parameter combinations and picks the best parameters.
-    `*parameters` is filled with the best parameters found, and the dictionary
-    constructed with those parameters is stored in `dictBuffer`.
+/*! ZDICT_optimizeTrainFromBuffer_cover():
+ * The same requirements as above hold for all the parameters except `parameters`.
+ * This function tries many parameter combinations and picks the best parameters.
+ * `*parameters` is filled with the best parameters found, and the dictionary
+ * constructed with those parameters is stored in `dictBuffer`.
+ *
+ * All of the parameters d, k, steps are optional.
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
+ * if steps is zero it defaults to its default value.
+ * If k is non-zero then we don't check multiple values of k, otherwise we check k values in [16, 2048].
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ *           On success `*parameters` contains the parameters selected.
+ * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory per input byte for each thread.
+ */
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t *parameters);
 
-    All of the parameters d, k, steps are optional.
-    If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
-    if steps is zero it defaults to its default value.
-    If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048].
-
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
-              or an error code, which can be tested with ZDICT_isError().
-              On success `*parameters` contains the parameters selected.
-    Note : COVER_optimizeTrainFromBuffer() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
-*/
-ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
-                                     const void* samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
-                                     COVER_params_t *parameters);
-
-/*! ZDICT_finalizeDictionary() :
-
-    Given a custom content as a basis for dictionary, and a set of samples,
-    finalize dictionary by adding headers and statistics.
-
-    Samples must be stored concatenated in a flat buffer `samplesBuffer`,
-    supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
-
-    dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
-    maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
-
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
-              or an error code, which can be tested by ZDICT_isError().
-    note : ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
-    note 2 : dictBuffer and dictContent can overlap
-*/
+/*! ZDICT_finalizeDictionary():
+ * Given a custom content as a basis for dictionary, and a set of samples,
+ * finalize dictionary by adding headers and statistics.
+ *
+ * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
+ *
+ * dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
+ * maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
+ *           or an error code, which can be tested by ZDICT_isError().
+ * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
+ * Note 2: dictBuffer and dictContent can overlap
+ */
 #define ZDICT_CONTENTSIZE_MIN 128
 #define ZDICT_DICTSIZE_MIN    256
 ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
@@ -164,7 +150,28 @@
                                 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
                                 ZDICT_params_t parameters);
 
+typedef struct {
+    unsigned selectivityLevel;   /* 0 means default; larger => select more => larger dictionary */
+    ZDICT_params_t zParams;
+} ZDICT_legacy_params_t;
 
+/*! ZDICT_trainFromBuffer_legacy():
+ * Train a dictionary from an array of samples.
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ * The resulting dictionary will be saved into `dictBuffer`.
+ * `parameters` is optional and can be provided with values set to 0 to mean "default".
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, but this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t parameters);
 
 /* Deprecation warnings */
 /* It is generally possible to disable deprecation warnings from compiler,
diff --git a/lib/zstd.h b/lib/zstd.h
index 8cf9ba7..58e9a56 100644
--- a/lib/zstd.h
+++ b/lib/zstd.h
@@ -68,7 +68,7 @@
 #define ZSTD_QUOTE(str) #str
 #define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
 #define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
-ZSTDLIB_API const char* ZSTD_versionString(void);   /* >= v1.3.0 */
+ZSTDLIB_API const char* ZSTD_versionString(void);   /* v1.3.0 */
 
 
 /***************************************
@@ -92,27 +92,41 @@
 ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
                               const void* src, size_t compressedSize);
 
-/*! ZSTD_getDecompressedSize() :
- *  NOTE: This function is planned to be obsolete, in favor of ZSTD_getFrameContentSize().
- *  ZSTD_getFrameContentSize() works the same way,
- *  returning the decompressed size of a single frame,
- *  but distinguishes empty frames from frames with an unknown size, or errors.
- *
- *  'src' is the start of a zstd compressed frame.
- *  @return : content size to be decompressed, as a 64-bits value _if known_, 0 otherwise.
- *   note 1 : decompressed size is an optional field, it may not be present, typically in streaming mode.
- *            When `return==0`, data to decompress could be any size.
+/*! ZSTD_getFrameContentSize() : v1.3.0
+ *  `src` should point to the start of a ZSTD encoded frame.
+ *  `srcSize` must be at least as large as the frame header.
+ *            hint : any size >= `ZSTD_frameHeaderSize_max` is large enough.
+ *  @return : - decompressed size of the frame in `src`, if known
+ *            - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ *            - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small)
+ *   note 1 : a 0 return value means the frame is valid but "empty".
+ *   note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode.
+ *            When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
  *            In which case, it's necessary to use streaming mode to decompress data.
- *            Optionally, application can use ZSTD_decompress() while relying on implied limits.
- *            (For example, data may be necessarily cut into blocks <= 16 KB).
- *   note 2 : decompressed size is always present when compression is done with ZSTD_compress()
- *   note 3 : decompressed size can be very large (64-bits value),
+ *            Optionally, application can rely on some implicit limit,
+ *            as ZSTD_decompress() only needs an upper bound of decompressed size.
+ *            (For example, data could be necessarily cut into blocks <= 16 KB).
+ *   note 3 : decompressed size is always present when compression is done with ZSTD_compress()
+ *   note 4 : decompressed size can be very large (64-bits value),
  *            potentially larger than what local system can handle as a single memory segment.
  *            In which case, it's necessary to use streaming mode to decompress data.
- *   note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
- *            Always ensure result fits within application's authorized limits.
+ *   note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ *            Always ensure return value fits within application's authorized limits.
  *            Each application can set its own limits.
- *   note 5 : when `return==0`, if precise failure cause is needed, use ZSTD_getFrameHeader() to know more. */
+ *   note 6 : This function replaces ZSTD_getDecompressedSize() */
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR   (0ULL - 2)
+ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/*! ZSTD_getDecompressedSize() :
+ *  NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
+ *  Both functions work the same way,
+ *  but ZSTD_getDecompressedSize() blends
+ *  "empty", "unknown" and "error" results in the same return value (0),
+ *  while ZSTD_getFrameContentSize() distinguishes them.
+ *
+ *  'src' is the start of a zstd compressed frame.
+ *  @return : content size to be decompressed, as a 64-bits value _if known and not empty_, 0 otherwise. */
 ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
 
 
@@ -291,7 +305,7 @@
 * *******************************************************************/
 
 typedef ZSTD_CCtx ZSTD_CStream;  /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
-                                 /* But continue to distinguish them for compatibility with versions <= v1.2.0 */
+                                 /* Continue to distinguish them for compatibility with versions <= v1.2.0 */
 /*===== ZSTD_CStream management functions =====*/
 ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
 ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
@@ -330,7 +344,7 @@
 * *******************************************************************************/
 
 typedef ZSTD_DCtx ZSTD_DStream;  /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
-                                 /* But continue to distinguish them for compatibility with versions <= v1.2.0 */
+                                 /* Continue to distinguish them for compatibility with versions <= v1.2.0 */
 /*===== ZSTD_DStream management functions =====*/
 ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
 ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);
@@ -349,8 +363,8 @@
 /****************************************************************************************
  * START OF ADVANCED AND EXPERIMENTAL FUNCTIONS
  * The definitions in this section are considered experimental.
- * They should never be used with a dynamic library, as they may change in the future.
- * They are provided for advanced usages.
+ * They should never be used with a dynamic library, as prototypes may change in the future.
+ * They are provided for advanced scenarios.
  * Use them only in association with static linking.
  * ***************************************************************************************/
 
@@ -360,6 +374,7 @@
 /* --- Constants ---*/
 #define ZSTD_MAGICNUMBER            0xFD2FB528   /* >= v0.8.0 */
 #define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50U
+#define ZSTD_MAGIC_DICTIONARY       0xEC30A437   /* v0.7+ */
 
 #define ZSTD_WINDOWLOG_MAX_32  27
 #define ZSTD_WINDOWLOG_MAX_64  27
@@ -379,9 +394,9 @@
 
 #define ZSTD_FRAMEHEADERSIZE_MAX 18    /* for static allocation */
 #define ZSTD_FRAMEHEADERSIZE_MIN  6
-static const size_t ZSTD_frameHeaderSize_prefix = 5;
-static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN;
+static const size_t ZSTD_frameHeaderSize_prefix = 5;  /* minimum input size to know frame header size */
 static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX;
+static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN;
 static const size_t ZSTD_skippableHeaderSize = 8;  /* magic number + skippable frame length */
 
 
@@ -412,7 +427,7 @@
 
 typedef struct {
     unsigned long long frameContentSize;
-    unsigned windowSize;
+    size_t windowSize;
     unsigned dictID;
     unsigned checksumFlag;
 } ZSTD_frameHeader;
@@ -432,26 +447,15 @@
 /*! ZSTD_findFrameCompressedSize() :
  *  `src` should point to the start of a ZSTD encoded frame or skippable frame
  *  `srcSize` must be at least as large as the frame
- *  @return : the compressed size of the frame pointed to by `src`,
+ *  @return : the compressed size of the first frame starting at `src`,
  *            suitable to pass to `ZSTD_decompress` or similar,
- *            or an error code if given invalid input. */
+ *            or an error code if input is invalid */
 ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
 
-/*! ZSTD_getFrameContentSize() :
- *  `src` should point to the start of a ZSTD encoded frame.
- *  `srcSize` must be at least as large as the frame header.
- *       A value >= `ZSTD_frameHeaderSize_max` is guaranteed to be large enough.
- *  @return : - decompressed size of the frame pointed to be `src` if known
- *            - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
- *            - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */
-#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
-#define ZSTD_CONTENTSIZE_ERROR   (0ULL - 2)
-ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
-
 /*! ZSTD_findDecompressedSize() :
  *  `src` should point the start of a series of ZSTD encoded and/or skippable frames
  *  `srcSize` must be the _exact_ size of this series
- *       (i.e. there should be a frame boundary exactly `srcSize` bytes after `src`)
+ *       (i.e. there should be a frame boundary exactly at `srcSize` bytes after `src`)
  *  @return : - decompressed size of all data in all successive frames
  *            - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN
  *            - if an error occurred: ZSTD_CONTENTSIZE_ERROR
@@ -459,8 +463,6 @@
  *   note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
  *            When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
  *            In which case, it's necessary to use streaming mode to decompress data.
- *            Optionally, application can still use ZSTD_decompress() while relying on implied limits.
- *            (For example, data may be necessarily cut into blocks <= 16 KB).
  *   note 2 : decompressed size is always present when compression is done with ZSTD_compress()
  *   note 3 : decompressed size can be very large (64-bits value),
  *            potentially larger than what local system can handle as a single memory segment.
@@ -469,7 +471,7 @@
  *            Always ensure result fits within application's authorized limits.
  *            Each application can set its own limits.
  *   note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to
- *            read each contained frame header.  This is efficient as most of the data is skipped,
+ *            read each contained frame header.  This is fast as most of the data is skipped,
  *            however it does mean that all frame data must be present and valid. */
 ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize);
 
@@ -477,7 +479,8 @@
 *   `src` should point to the start of a ZSTD frame
 *   `srcSize` must be >= ZSTD_frameHeaderSize_prefix.
 *   @return : size of the Frame Header */
-size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
 
 /***************************************
 *  Context memory usage
@@ -495,23 +498,39 @@
 
 /*! ZSTD_estimate*() :
  *  These functions make it possible to estimate memory usage
- *  of a future target object, before its allocation,
- *  given a set of parameters, which vary depending on target object.
- *  The objective is to guide decision before allocation.
+ *  of a future {D,C}Ctx, before its creation.
+ *  ZSTD_estimateCCtxSize() will provide a budget large enough for any compression level up to selected one.
+ *  It will also consider src size to be arbitrarily "large", which is worst case.
+ *  If srcSize is known to always be small, ZSTD_estimateCCtxSize_advanced() can provide a tighter estimation.
+ *  ZSTD_estimateCCtxSize_advanced() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
  *  Note : CCtx estimation is only correct for single-threaded compression */
-ZSTDLIB_API size_t ZSTD_estimateCCtxSize(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_advanced(ZSTD_compressionParameters cParams);
 ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void);
 
 /*! ZSTD_estimate?StreamSize() :
+ *  ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
+ *  It will also consider src size to be arbitrarily "large", which is worst case.
+ *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_advanced() can provide a tighter estimation.
+ *  ZSTD_estimateCStreamSize_advanced() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ *  Note : CStream estimation is only correct for single-threaded compression.
+ *  ZSTD_DStream memory budget depends on window Size.
+ *  This information can be passed manually, using ZSTD_estimateDStreamSize,
+ *  or deduced from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
  *  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
- *         an internal ?Dict will be created, which size is not estimated here.
+ *         an internal ?Dict will be created, which additional size is not estimated here.
  *         In this case, get total size by adding ZSTD_estimate?DictSize */
-ZSTDLIB_API size_t ZSTD_estimateCStreamSize(ZSTD_compressionParameters cParams);
-ZSTDLIB_API size_t ZSTD_estimateDStreamSize(ZSTD_frameHeader fHeader);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_advanced(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
 
 /*! ZSTD_estimate?DictSize() :
+ *  ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+ *  ZSTD_estimateCDictSize_advanced() makes it possible to control precisely compression parameters, like ZSTD_createCDict_advanced().
  *  Note : dictionary created "byReference" are smaller */
-ZSTDLIB_API size_t ZSTD_estimateCDictSize(ZSTD_compressionParameters cParams, size_t dictSize, unsigned byReference);
+ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, unsigned byReference);
 ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, unsigned byReference);
 
 
@@ -558,12 +577,16 @@
 ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
 
 
-typedef enum { ZSTD_dm_auto=0, ZSTD_dm_rawContent, ZSTD_dm_fullDict } ZSTD_dictMode_e;
+typedef enum { ZSTD_dm_auto=0,        /* dictionary is "full" if it starts with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+               ZSTD_dm_rawContent,    /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+               ZSTD_dm_fullDict       /* refuses to load a dictionary if it does not respect Zstandard's specification */
+} ZSTD_dictMode_e;
 /*! ZSTD_createCDict_advanced() :
  *  Create a ZSTD_CDict using external alloc and free, and customized compression parameters */
 ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
                                                   unsigned byReference, ZSTD_dictMode_e dictMode,
-                                                  ZSTD_compressionParameters cParams, ZSTD_customMem customMem);
+                                                  ZSTD_compressionParameters cParams,
+                                                  ZSTD_customMem customMem);
 
 /*! ZSTD_initStaticCDict_advanced() :
  *  Generate a digested dictionary in provided memory area.
@@ -707,6 +730,7 @@
 
 /*=====   Advanced Streaming compression functions  =====*/
 ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticCCtx() */
 ZSTDLIB_API size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize);   /**< pledgedSrcSize must be correct, a size of 0 means unknown.  for a frame size of 0 use initCStream_advanced */
 ZSTDLIB_API size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /**< creates of an internal CDict (incompatible with static CCtx), except if dict == NULL or dictSize < 8, in which case no dict is used. */
 ZSTDLIB_API size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize,
@@ -727,6 +751,7 @@
 /*=====   Advanced Streaming decompression functions  =====*/
 typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e;
 ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticDCtx() */
 ZSTDLIB_API size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds, ZSTD_DStreamParameter_e paramType, unsigned paramValue);
 ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /**< note: a dict will not be used if dict == NULL or dictSize < 8 */
 ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);  /**< note : ddict will just be referenced, and must outlive decompression session */
@@ -793,16 +818,18 @@
   A ZSTD_DCtx object can be re-used multiple times.
 
   First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
-  It fills a ZSTD_frameParams structure which provide important information to correctly decode the frame,
-  such as the minimum rolling buffer size to allocate to decompress data (`windowSize`),
-  and the dictionary ID used.
+  It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
+  such as minimum rolling buffer size to allocate to decompress data (`windowSize`),
+  and the dictionary ID in use.
   (Note : content size is optional, it may not be present. 0 means : content size unknown).
   Note that these values could be wrong, either because of data malformation, or because an attacker is spoofing deliberate false information.
   As a consequence, check that values remain within valid application range, especially `windowSize`, before allocation.
-  Each application can set its own limit, depending on local restrictions. For extended interoperability, it is recommended to support at least 8 MB.
-  Frame parameters are extracted from the beginning of the compressed frame.
-  Data fragment must be large enough to ensure successful decoding, typically `ZSTD_frameHeaderSize_max` bytes.
-  @result : 0 : successful decoding, the `ZSTD_frameParams` structure is correctly filled.
+  Each application can set its own limit, depending on local restrictions.
+  For extended interoperability, it is recommended to support windowSize of at least 8 MB.
+  Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
+  Data fragment must be large enough to ensure successful decoding.
+  `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
+  @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
            >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
            errorCode, which can be tested using ZSTD_isError().
 
@@ -847,7 +874,7 @@
 */
 
 /*=====   Buffer-less streaming decompression functions  =====*/
-ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /**< doesn't consume input, see details below */
+ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /**< doesn't consume input */
 ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
 ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
 ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
@@ -924,26 +951,17 @@
                               * The higher the value of selected strategy, the more complex it is,
                               * resulting in stronger and slower compression.
                               * Special: value 0 means "do not change strategy". */
-#if 0
-    ZSTD_p_windowSize,       /* Maximum allowed back-reference distance.
-                              * Can be set to a more precise value than windowLog.
-                              * Will be transparently reduced to closest possible inferior value
-                              * (see Zstandard compression format) */
-                             /* Not ready yet ! */
-#endif
 
     /* frame parameters */
     ZSTD_p_contentSizeFlag=200, /* Content size is written into frame header _whenever known_ (default:1) */
     ZSTD_p_checksumFlag,     /* A 32-bits checksum of content is written at end of frame (default:0) */
     ZSTD_p_dictIDFlag,       /* When applicable, dictID of dictionary is provided in frame header (default:1) */
 
-    /* dictionary parameters */
-    ZSTD_p_refDictContent=300, /* Content of dictionary content will be referenced, instead of copied (default:0).
-                              * This avoids duplicating dictionary content.
-                              * But it also requires that dictionary buffer outlives its users */
-                             /* Not ready yet ! <=================================== */
-    ZSTD_p_dictMode,         /* Select how dictionary must be interpreted. Value must be from type ZSTD_dictMode_e.
+    /* dictionary parameters (must be set before ZSTD_CCtx_loadDictionary) */
+    ZSTD_p_dictMode=300,     /* Select how dictionary content must be interpreted. Value must be from type ZSTD_dictMode_e.
                               * default : 0==auto : dictionary will be "full" if it respects specification, otherwise it will be "rawContent" */
+    ZSTD_p_refDictContent,   /* Dictionary content will be referenced, instead of copied (default:0==byCopy).
+                              * It requires that dictionary buffer outlives its users */
 
     /* multi-threading parameters */
     ZSTD_p_nbThreads=400,    /* Select how many threads a compression job can spawn (default:1)
@@ -958,9 +976,9 @@
                               * 0 => no overlap, 6(default) => use 1/8th of windowSize, >=9 => use full windowSize */
 
     /* advanced parameters - may not remain available after API update */
-    ZSTD_p_forceMaxWindow=1100, /* Force back-references to remain < windowSize,
-                              * even when referencing into Dictionary content.
-                              * default : 0 when using a CDict, 1 when using a Prefix */
+    ZSTD_p_forceMaxWindow=1100, /* Force back-reference distances to remain < windowSize,
+                              * even when referencing into Dictionary content (default:0) */
+
 } ZSTD_cParameter;
 
 
@@ -987,8 +1005,8 @@
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
  *            meaning "return to no-dictionary mode".
- *  Note 1 : Dictionary content will be copied internally,
- *           except if ZSTD_p_refDictContent is set.
+ *  Note 1 : `dict` content will be copied internally,
+ *           except if ZSTD_p_refDictContent is set before loading.
  *  Note 2 : Loading a dictionary involves building tables, which are dependent on compression parameters.
  *           For this reason, compression parameters cannot be changed anymore after loading a dictionary.
  *           It's also a CPU-heavy operation, with non-negligible impact on latency.
@@ -997,7 +1015,7 @@
 ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
 
 /*! ZSTD_CCtx_refCDict() :
- *  Ref a prepared dictionary, to be used for all next compression jobs.
+ *  Reference a prepared dictionary, to be used for all next compression jobs.
  *  Note that compression parameters are enforced from within CDict,
  *  and supercede any compression parameter previously set within CCtx.
  *  The dictionary will remain valid for future compression jobs using same CCtx.
@@ -1010,16 +1028,18 @@
 ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
 
 /*! ZSTD_CCtx_refPrefix() :
- *  Reference a prefix (content-only dictionary) to bootstrap next compression job.
- *  Decompression will have to use same prefix.
- *  Prefix is only used once. Tables are discarded at end of compression job.
- *  If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_CDict.
+ *  Reference a prefix (single-usage dictionary) for next compression job.
+ *  Decompression needs same prefix to properly regenerate data.
+ *  Prefix is **only used once**. Tables are discarded at end of compression job.
+ *  Subsequent compression jobs will be done without prefix (if none is explicitly referenced).
+ *  If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_CDict instead.
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
- *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous prefix, meaning "return to no-dictionary mode".
+ *  Special : Adding any prefix (including NULL) invalidates any previous prefix or dictionary
  *  Note 1 : Prefix buffer is referenced. It must outlive compression job.
  *  Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters.
- *           It's a CPU-heavy operation, with non-negligible impact on latency. */
-ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize);   /* Not ready yet ! <===================================== */
+ *           It's a CPU-heavy operation, with non-negligible impact on latency.
+ *  Note 3 : it's possible to alter ZSTD_p_dictMode using ZSTD_CCtx_setParameter() */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize);
 
 
 
diff --git a/programs/Makefile b/programs/Makefile
index ab2db72..8b080d4 100644
--- a/programs/Makefile
+++ b/programs/Makefile
@@ -139,16 +139,13 @@
 
 $(ZSTDDECOMP_O): CFLAGS += $(ALIGN_LOOP)
 
-zstd xzstd zstd4 xzstd4 : CPPFLAGS += $(THREAD_CPP) $(ZLIBCPP)
-zstd xzstd zstd4 xzstd4 : LDFLAGS += $(THREAD_LD) $(ZLIBLD)
-xzstd xzstd4 : CPPFLAGS += $(LZMACPP)
-xzstd xzstd4 : LDFLAGS += $(LZMALD)
-zstd4 xzstd4 : CPPFLAGS += $(LZ4CPP)
-zstd4 xzstd4 : LDFLAGS += $(LZ4LD)
-zstd zstd4 : LZMA_MSG := - xz/lzma support is disabled
-zstd xzstd : LZ4_MSG := - lz4 support is disabled
-zstd xzstd zstd4 xzstd4 : CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
-zstd xzstd zstd4 xzstd4 : $(ZSTDLIB_FILES) zstdcli.o fileio.o bench.o datagen.o dibio.o
+zstd zstd4 : CPPFLAGS += $(THREAD_CPP) $(ZLIBCPP) $(LZMACPP)
+zstd zstd4 : LDFLAGS += $(THREAD_LD) $(ZLIBLD) $(LZMALD)
+zstd4 : CPPFLAGS += $(LZ4CPP)
+zstd4 : LDFLAGS += $(LZ4LD)
+zstd : LZ4_MSG := - lz4 support is disabled
+zstd zstd4 : CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
+zstd zstd4 : $(ZSTDLIB_FILES) zstdcli.o fileio.o bench.o datagen.o dibio.o
 	@echo "$(THREAD_MSG)"
 	@echo "$(ZLIB_MSG)"
 	@echo "$(LZMA_MSG)"
@@ -181,6 +178,11 @@
 zstd-nogz : ZLIB_MSG := - gzip support is disabled
 zstd-nogz : zstd
 
+zstd-noxz : LZMACPP :=
+zstd-noxz : LZMALD :=
+zstd-noxz : LZMA_MSG := - xz/lzma support is disabled
+zstd-noxz : zstd
+
 
 zstd-pgo : MOREFLAGS = -fprofile-generate
 zstd-pgo : clean zstd
diff --git a/programs/README.md b/programs/README.md
index d7922a0..bd8fba0 100644
--- a/programs/README.md
+++ b/programs/README.md
@@ -24,13 +24,23 @@
 - __HAVE_ZLIB__ : `zstd` can compress and decompress files in `.gz` format.
   This is done through command `--format=gzip`.
   Alternatively, symlinks named `gzip` or `gunzip` will mimic intended behavior.
-  .gz support is automatically enabled when `zlib` library is detected at build time.
-  It's possible to disable .gz support, by either compiling `zstd-nogz` target or using HAVE_ZLIB=0 variable.
+  `.gz` support is automatically enabled when `zlib` library is detected at build time.
+  It's possible to disable `.gz` support, by either compiling `zstd-nogz` target or using HAVE_ZLIB=0 variable.
   Example : make zstd HAVE_ZLIB=0
   It's also possible to force compilation with zlib support, using HAVE_ZLIB=1.
   In which case, linking stage will fail if `zlib` library cannot be found.
   This might be useful to prevent silent feature disabling.
 
+- __HAVE_LZMA__ : `zstd` can compress and decompress files in `.xz` and `.lzma` formats.
+  This is done through commands `--format=xz` and `--format=lzma` respectively.
+  Alternatively, symlinks named `xz`, `unxz`, `lzma`, or `unlzma` will mimic intended behavior.
+  `.xz` and `.lzma` support is automatically enabled when `lzma` library is detected at build time.
+  It's possible to disable `.xz` and `.lzma` support, by either compiling `zstd-noxz` target or using HAVE_LZMA=0 variable.
+  Example : make zstd HAVE_LZMA=0
+  It's also possible to force compilation with lzma support, using HAVE_LZMA=1.
+  In which case, linking stage will fail if `lzma` library cannot be found.
+  This might be useful to prevent silent feature disabling.
+
 
 #### Aggregation of parameters
 CLI supports aggregation of parameters i.e. `-b1`, `-e18`, and `-i1` can be joined into `-b1e18i1`.
diff --git a/programs/dibio.c b/programs/dibio.c
index aac3642..31cde5c 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -216,21 +216,21 @@
 }
 
 
-/*! ZDICT_trainFromBuffer_unsafe() :
+/*! ZDICT_trainFromBuffer_unsafe_legacy() :
     Strictly Internal use only !!
-    Same as ZDICT_trainFromBuffer_advanced(), but does not control `samplesBuffer`.
+    Same as ZDICT_trainFromBuffer_legacy(), but does not control `samplesBuffer`.
     `samplesBuffer` must be followed by noisy guard band to avoid out-of-buffer reads.
     @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
               or an error code.
 */
-size_t ZDICT_trainFromBuffer_unsafe(void* dictBuffer, size_t dictBufferCapacity,
-                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                              ZDICT_params_t parameters);
+size_t ZDICT_trainFromBuffer_unsafe_legacy(void* dictBuffer, size_t dictBufferCapacity,
+                                           const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                           ZDICT_legacy_params_t parameters);
 
 
 int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                        const char** fileNamesTable, unsigned nbFiles,
-                       ZDICT_params_t *params, COVER_params_t *coverParams,
+                       ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
                        int optimizeCover)
 {
     void* const dictBuffer = malloc(maxDictSize);
@@ -243,8 +243,8 @@
     int result = 0;
 
     /* Checks */
-    if (params) g_displayLevel = params->notificationLevel;
-    else if (coverParams) g_displayLevel = coverParams->notificationLevel;
+    if (params) g_displayLevel = params->zParams.notificationLevel;
+    else if (coverParams) g_displayLevel = coverParams->zParams.notificationLevel;
     else EXM_THROW(13, "Neither dictionary algorith selected");   /* should not happen */
     if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
     if (g_tooLargeSamples) {
@@ -273,20 +273,20 @@
         size_t dictSize;
         if (params) {
             DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
-            dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
-                                                    srcBuffer, fileSizes, nbFiles,
-                                                    *params);
+            dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize,
+                                                           srcBuffer, fileSizes, nbFiles,
+                                                           *params);
         } else if (optimizeCover) {
-            dictSize = COVER_optimizeTrainFromBuffer(
-                dictBuffer, maxDictSize, srcBuffer, fileSizes, nbFiles,
-                coverParams);
+            dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
+                                                           srcBuffer, fileSizes, nbFiles,
+                                                           coverParams);
             if (!ZDICT_isError(dictSize)) {
-              DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps);
+                DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps);
             }
         } else {
-            dictSize = COVER_trainFromBuffer(dictBuffer, maxDictSize,
-                                             srcBuffer, fileSizes, nbFiles,
-                                             *coverParams);
+            dictSize =
+                ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
+                                            fileSizes, nbFiles, *coverParams);
         }
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
diff --git a/programs/dibio.h b/programs/dibio.h
index e61d004..84f7d58 100644
--- a/programs/dibio.h
+++ b/programs/dibio.h
@@ -32,7 +32,7 @@
 */
 int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                        const char** fileNamesTable, unsigned nbFiles,
-                       ZDICT_params_t *params, COVER_params_t *coverParams,
+                       ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
                        int optimizeCover);
 
 #endif
diff --git a/programs/fileio.c b/programs/fileio.c
index 78ac3ba..7794371 100644
--- a/programs/fileio.c
+++ b/programs/fileio.c
@@ -559,7 +559,7 @@
 
     strm.next_in = 0;
     strm.avail_in = 0;
-    strm.next_out = ress->dstBuffer;
+    strm.next_out = (BYTE*)ress->dstBuffer;
     strm.avail_out = ress->dstBufferSize;
 
     while (1) {
@@ -567,7 +567,7 @@
             size_t const inSize = fread(ress->srcBuffer, 1, ress->srcBufferSize, ress->srcFile);
             if (inSize == 0) action = LZMA_FINISH;
             inFileSize += inSize;
-            strm.next_in = ress->srcBuffer;
+            strm.next_in = (BYTE const*)ress->srcBuffer;
             strm.avail_in = inSize;
         }
 
@@ -580,7 +580,7 @@
                 if (fwrite(ress->dstBuffer, 1, compBytes, ress->dstFile) != compBytes)
                     EXM_THROW(73, "Write error : cannot write to output file");
                 outFileSize += compBytes;
-                strm.next_out = ress->dstBuffer;
+                strm.next_out = (BYTE*)ress->dstBuffer;
                 strm.avail_out = ress->dstBufferSize;
         }   }
         if (!srcFileSize)
@@ -1490,16 +1490,16 @@
         EXM_THROW(71, "zstd: %s: lzma_alone_decoder/lzma_stream_decoder error %d",
                         srcFileName, ret);
 
-    strm.next_out = ress->dstBuffer;
+    strm.next_out = (BYTE*)ress->dstBuffer;
     strm.avail_out = ress->dstBufferSize;
+    strm.next_in = (BYTE const*)ress->srcBuffer;
     strm.avail_in = ress->srcBufferLoaded;
-    strm.next_in = ress->srcBuffer;
 
     for ( ; ; ) {
         if (strm.avail_in == 0) {
             ress->srcBufferLoaded = fread(ress->srcBuffer, 1, ress->srcBufferSize, srcFile);
             if (ress->srcBufferLoaded == 0) action = LZMA_FINISH;
-            strm.next_in = ress->srcBuffer;
+            strm.next_in = (BYTE const*)ress->srcBuffer;
             strm.avail_in = ress->srcBufferLoaded;
         }
         ret = lzma_code(&strm, action);
@@ -1515,7 +1515,7 @@
                 if (fwrite(ress->dstBuffer, 1, decompBytes, ress->dstFile) != decompBytes)
                     EXM_THROW(31, "Write error : cannot write to output file");
                 outFileSize += decompBytes;
-                strm.next_out = ress->dstBuffer;
+                strm.next_out = (BYTE*)ress->dstBuffer;
                 strm.avail_out = ress->dstBufferSize;
         }   }
         if (ret == LZMA_STREAM_END) break;
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index 5794386..35772e0 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -56,7 +56,9 @@
 #define ZSTD_GUNZIP "gunzip"
 #define ZSTD_GZCAT "gzcat"
 #define ZSTD_LZMA "lzma"
+#define ZSTD_UNLZMA "unlzma"
 #define ZSTD_XZ "xz"
+#define ZSTD_UNXZ "unxz"
 
 #define KB *(1 <<10)
 #define MB *(1 <<20)
@@ -246,7 +248,7 @@
  * @return 1 means that cover parameters were correct
  * @return 0 in case of malformed parameters
  */
-static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t* params)
+static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params)
 {
     memset(params, 0, sizeof(*params));
     for (; ;) {
@@ -275,9 +277,9 @@
     return 1;
 }
 
-static COVER_params_t defaultCoverParams(void)
+static ZDICT_cover_params_t defaultCoverParams(void)
 {
-    COVER_params_t params;
+    ZDICT_cover_params_t params;
     memset(&params, 0, sizeof(params));
     params.d = 8;
     params.steps = 4;
@@ -356,7 +358,7 @@
     unsigned fileNamesNb;
 #endif
 #ifndef ZSTD_NODICT
-    COVER_params_t coverParams = defaultCoverParams();
+    ZDICT_cover_params_t coverParams = defaultCoverParams();
     int cover = 1;
 #endif
 
@@ -379,7 +381,9 @@
     if (exeNameMatch(programName, ZSTD_GUNZIP)) { operation=zom_decompress; FIO_setRemoveSrcFile(1); }                                          /* behave like gunzip */
     if (exeNameMatch(programName, ZSTD_GZCAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; g_displayLevel=1; }  /* behave like gzcat */
     if (exeNameMatch(programName, ZSTD_LZMA)) { suffix = LZMA_EXTENSION; FIO_setCompressionType(FIO_lzmaCompression); FIO_setRemoveSrcFile(1); }    /* behave like lzma */
+    if (exeNameMatch(programName, ZSTD_UNLZMA)) { operation=zom_decompress; FIO_setCompressionType(FIO_lzmaCompression); FIO_setRemoveSrcFile(1); }    /* behave like unlzma */
     if (exeNameMatch(programName, ZSTD_XZ)) { suffix = XZ_EXTENSION; FIO_setCompressionType(FIO_xzCompression); FIO_setRemoveSrcFile(1); }    /* behave like xz */
+    if (exeNameMatch(programName, ZSTD_UNXZ)) { operation=zom_decompress; FIO_setCompressionType(FIO_xzCompression); FIO_setRemoveSrcFile(1); }    /* behave like unxz */
     memset(&compressionParams, 0, sizeof(compressionParams));
 
     /* command switches */
@@ -695,20 +699,20 @@
     /* Check if dictionary builder is selected */
     if (operation==zom_train) {
 #ifndef ZSTD_NODICT
+        ZDICT_params_t zParams;
+        zParams.compressionLevel = dictCLevel;
+        zParams.notificationLevel = g_displayLevel;
+        zParams.dictID = dictID;
         if (cover) {
             int const optimize = !coverParams.k || !coverParams.d;
             coverParams.nbThreads = nbThreads;
-            coverParams.compressionLevel = dictCLevel;
-            coverParams.notificationLevel = g_displayLevel;
-            coverParams.dictID = dictID;
+            coverParams.zParams = zParams;
             operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, optimize);
         } else {
-            ZDICT_params_t dictParams;
+            ZDICT_legacy_params_t dictParams;
             memset(&dictParams, 0, sizeof(dictParams));
-            dictParams.compressionLevel = dictCLevel;
             dictParams.selectivityLevel = dictSelect;
-            dictParams.notificationLevel = g_displayLevel;
-            dictParams.dictID = dictID;
+            dictParams.zParams = zParams;
             operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
         }
 #endif
diff --git a/tests/Makefile b/tests/Makefile
index c116404..5c24080 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -180,7 +180,7 @@
 legacy : $(ZSTD_FILES) $(wildcard $(ZSTDDIR)/legacy/*.c) legacy.c
 	$(CC)      $(FLAGS) $^ -o $@$(EXT)
 
-decodecorpus	: $(filter-out $(ZSTDDIR)/compress/zstd_compress.c, $(wildcard $(ZSTD_FILES))) decodecorpus.c
+decodecorpus	: $(filter-out $(ZSTDDIR)/compress/zstd_compress.c, $(wildcard $(ZSTD_FILES))) $(ZDICT_FILES) decodecorpus.c
 	$(CC)      $(FLAGS) $^ -o $@$(EXT) -lm
 
 symbols  : symbols.c
@@ -272,7 +272,7 @@
 
 test32: test-zstd32 test-fullbench32 test-fuzzer32 test-zstream32
 
-test-all: test test32 valgrindTest
+test-all: test test32 valgrindTest test-decodecorpus-cli
 
 test-zstd: ZSTD = $(PRGDIR)/zstd
 test-zstd: zstd zstd-playTests
@@ -342,6 +342,39 @@
 test-decodecorpus: decodecorpus
 	$(QEMU_SYS) ./decodecorpus -t $(DECODECORPUS_TESTTIME)
 
+test-decodecorpus-cli: decodecorpus
+	@echo "\n ---- decodecorpus basic cli tests ----"
+	@mkdir testdir
+	./decodecorpus -n5 -otestdir -ptestdir
+	@cd testdir && \
+	$(ZSTD) -d z000000.zst -o tmp0 && \
+	$(ZSTD) -d z000001.zst -o tmp1 && \
+	$(ZSTD) -d z000002.zst -o tmp2 && \
+	$(ZSTD) -d z000003.zst -o tmp3 && \
+	$(ZSTD) -d z000004.zst -o tmp4 && \
+	diff z000000 tmp0 && \
+	diff z000001 tmp1 && \
+	diff z000002 tmp2 && \
+	diff z000003 tmp3 && \
+	diff z000004 tmp4 && \
+	rm ./* && \
+	cd ..
+	@echo "\n ---- decodecorpus dictionary cli tests ----"
+	./decodecorpus -n5 -otestdir -ptestdir --use-dict=1MB
+	@cd testdir && \
+	$(ZSTD) -d z000000.zst -D dictionary -o tmp0 && \
+	$(ZSTD) -d z000001.zst -D dictionary -o tmp1 && \
+	$(ZSTD) -d z000002.zst -D dictionary -o tmp2 && \
+	$(ZSTD) -d z000003.zst -D dictionary -o tmp3 && \
+	$(ZSTD) -d z000004.zst -D dictionary -o tmp4 && \
+	diff z000000 tmp0 && \
+	diff z000001 tmp1 && \
+	diff z000002 tmp2 && \
+	diff z000003 tmp3 && \
+	diff z000004 tmp4 && \
+	cd ..
+	@rm -rf testdir
+
 test-pool: pool
 	$(QEMU_SYS) ./pool
 
diff --git a/tests/decodecorpus.c b/tests/decodecorpus.c
index 940e3d8..eaf0745 100644
--- a/tests/decodecorpus.c
+++ b/tests/decodecorpus.c
@@ -18,6 +18,8 @@
 #include "zstd.h"
 #include "zstd_internal.h"
 #include "mem.h"
+#define ZDICT_STATIC_LINKING_ONLY
+#include "zdict.h"
 
 // Direct access to internal compression functions is required
 #include "zstd_compress.c"
@@ -229,6 +231,12 @@
     cblockStats_t oldStats; /* so they can be rolled back if uncompressible */
 } frame_t;
 
+typedef struct {
+    int useDict;
+    U32 dictID;
+    size_t dictContentSize;
+    BYTE* dictContent;
+} dictInfo;
 /*-*******************************************************
 *  Generator Functions
 *********************************************************/
@@ -238,7 +246,7 @@
 } opts; /* advanced options on generation */
 
 /* Generate and write a random frame header */
-static void writeFrameHeader(U32* seed, frame_t* frame)
+static void writeFrameHeader(U32* seed, frame_t* frame, dictInfo info)
 {
     BYTE* const op = frame->data;
     size_t pos = 0;
@@ -304,15 +312,26 @@
     pos += 4;
 
     {
+        /*
+         * fcsCode: 2-bit flag specifying how many bytes are used to represent Frame_Content_Size (bits 7-6)
+         * singleSegment: 1-bit flag describing if data must be regenerated within a single continuous memory segment. (bit 5)
+         * contentChecksumFlag: 1-bit flag that is set if frame includes checksum at the end -- set to 1 below (bit 2)
+         * dictBits: 2-bit flag describing how many bytes Dictionary_ID uses -- set to 3 when a dictionary is used, else 0 (bits 1-0)
+         * For more information: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header
+         */
+        int const dictBits = info.useDict ? 3 : 0;
         BYTE const frameHeaderDescriptor =
-                (BYTE) ((fcsCode << 6) | (singleSegment << 5) | (1 << 2));
+                (BYTE) ((fcsCode << 6) | (singleSegment << 5) | (1 << 2) | dictBits);
         op[pos++] = frameHeaderDescriptor;
     }
 
     if (!singleSegment) {
         op[pos++] = windowByte;
     }
-
+    if (info.useDict) {
+        MEM_writeLE32(op + pos, (U32) info.dictID);
+        pos += 4;
+    }
     if (contentSizeFlag) {
         switch (fcsCode) {
         default: /* Impossible */
@@ -603,7 +622,7 @@
 
 /* Randomly generate sequence commands */
 static U32 generateSequences(U32* seed, frame_t* frame, seqStore_t* seqStore,
-                                size_t contentSize, size_t literalsSize)
+                                size_t contentSize, size_t literalsSize, dictInfo info)
 {
     /* The total length of all the matches */
     size_t const remainingMatch = contentSize - literalsSize;
@@ -627,7 +646,6 @@
     }
 
     DISPLAYLEVEL(5, "    total match lengths: %u\n", (U32)remainingMatch);
-
     for (i = 0; i < numSequences; i++) {
         /* Generate match and literal lengths by exponential distribution to
          * ensure nice numbers */
@@ -652,14 +670,33 @@
 
         memcpy(srcPtr, literals, literalLen);
         srcPtr += literalLen;
-
         do {
             if (RAND(seed) & 7) {
                 /* do a normal offset */
+                U32 const dataDecompressed = (U32)((BYTE*)srcPtr-(BYTE*)frame->srcStart);
                 offset = (RAND(seed) %
                           MIN(frame->header.windowSize,
                               (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) +
                          1;
+                if (info.useDict && (RAND(seed) & 1) && i + 1 != numSequences && dataDecompressed < frame->header.windowSize) {
+                    /* need to occasionally generate offsets that go past the start */
+                    /* including i+1 != numSequences because the last sequence has to adhere to the predetermined contentSize */
+                    U32 lenPastStart = (RAND(seed) % info.dictContentSize) + 1;
+                    offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart)+lenPastStart;
+                    if (offset > frame->header.windowSize) {
+                        if (lenPastStart < MIN_SEQ_LEN) {
+                            /* when offset > windowSize, matchLen bound by end of dictionary (lenPastStart) */
+                            /* this also means that lenPastStart must be at least MIN_SEQ_LEN */
+                            /* make sure lenPastStart does not go past dictionary start though */
+                            lenPastStart = MIN(lenPastStart+MIN_SEQ_LEN, (U32)info.dictContentSize);
+                            offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) + lenPastStart;
+                        }
+                        {
+                            U32 const matchLenBound = MIN(frame->header.windowSize, lenPastStart);
+                            matchLen = MIN(matchLen, matchLenBound);
+                        }
+                    }
+                }
                 offsetCode = offset + ZSTD_REP_MOVE;
                 repIndex = 2;
             } else {
@@ -675,11 +712,20 @@
                     repIndex = MIN(2, offsetCode + 1);
                 }
             }
-        } while (offset > (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart) || offset == 0);
+        } while (((!info.useDict) && (offset > (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) || offset == 0);
 
-        {   size_t j;
+        {
+            size_t j;
+            BYTE* const dictEnd = info.dictContent + info.dictContentSize;
             for (j = 0; j < matchLen; j++) {
-                *srcPtr = *(srcPtr-offset);
+                if ((U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) < offset) {
+                    /* copy from dictionary instead of literals */
+                    size_t const dictOffset = offset - (srcPtr - (BYTE*)frame->srcStart);
+                    *srcPtr = *(dictEnd - dictOffset);
+                }
+                else {
+                    *srcPtr = *(srcPtr-offset);
+                }
                 srcPtr++;
             }
         }
@@ -929,7 +975,7 @@
 }
 
 static size_t writeSequencesBlock(U32* seed, frame_t* frame, size_t contentSize,
-                                  size_t literalsSize)
+                                  size_t literalsSize, dictInfo info)
 {
     seqStore_t seqStore;
     size_t numSequences;
@@ -938,14 +984,14 @@
     initSeqStore(&seqStore);
 
     /* randomly generate sequences */
-    numSequences = generateSequences(seed, frame, &seqStore, contentSize, literalsSize);
+    numSequences = generateSequences(seed, frame, &seqStore, contentSize, literalsSize, info);
     /* write them out to the frame data */
     CHECKERR(writeSequences(seed, frame, &seqStore, numSequences));
 
     return numSequences;
 }
 
-static size_t writeCompressedBlock(U32* seed, frame_t* frame, size_t contentSize)
+static size_t writeCompressedBlock(U32* seed, frame_t* frame, size_t contentSize, dictInfo info)
 {
     BYTE* const blockStart = (BYTE*)frame->data;
     size_t literalsSize;
@@ -957,7 +1003,7 @@
 
     DISPLAYLEVEL(4, "   literals size: %u\n", (U32)literalsSize);
 
-    nbSeq = writeSequencesBlock(seed, frame, contentSize, literalsSize);
+    nbSeq = writeSequencesBlock(seed, frame, contentSize, literalsSize, info);
 
     DISPLAYLEVEL(4, "   number of sequences: %u\n", (U32)nbSeq);
 
@@ -965,7 +1011,7 @@
 }
 
 static void writeBlock(U32* seed, frame_t* frame, size_t contentSize,
-                       int lastBlock)
+                       int lastBlock, dictInfo info)
 {
     int const blockTypeDesc = RAND(seed) % 8;
     size_t blockSize;
@@ -1005,7 +1051,7 @@
         frame->oldStats = frame->stats;
 
         frame->data = op;
-        compressedSize = writeCompressedBlock(seed, frame, contentSize);
+        compressedSize = writeCompressedBlock(seed, frame, contentSize, info);
         if (compressedSize > contentSize) {
             blockType = 0;
             memcpy(op, frame->src, contentSize);
@@ -1031,7 +1077,7 @@
     frame->data = op;
 }
 
-static void writeBlocks(U32* seed, frame_t* frame)
+static void writeBlocks(U32* seed, frame_t* frame, dictInfo info)
 {
     size_t contentLeft = frame->header.contentSize;
     size_t const maxBlockSize = MIN(MAX_BLOCK_SIZE, frame->header.windowSize);
@@ -1054,7 +1100,7 @@
             }
         }
 
-        writeBlock(seed, frame, blockContentSize, lastBlock);
+        writeBlock(seed, frame, blockContentSize, lastBlock, info);
 
         contentLeft -= blockContentSize;
         if (lastBlock) break;
@@ -1119,20 +1165,102 @@
 }
 
 /* Return the final seed */
-static U32 generateFrame(U32 seed, frame_t* fr)
+static U32 generateFrame(U32 seed, frame_t* fr, dictInfo info)
 {
     /* generate a complete frame */
     DISPLAYLEVEL(1, "frame seed: %u\n", seed);
-
     initFrame(fr);
 
-    writeFrameHeader(&seed, fr);
-    writeBlocks(&seed, fr);
+    writeFrameHeader(&seed, fr, info);
+    writeBlocks(&seed, fr, info);
     writeChecksum(fr);
 
     return seed;
 }
 
+/*_*******************************************************
+*  Dictionary Helper Functions
+*********************************************************/
+/* returns 0 if successful, otherwise returns 1 upon error */
+static int genRandomDict(U32 dictID, U32 seed, size_t dictSize, BYTE* fullDict){
+    /* allocate space for samples */
+    int ret = 0;
+    unsigned const numSamples = 4;
+    size_t sampleSizes[4];
+    BYTE* const samples = malloc(5000*sizeof(BYTE));
+    if (samples == NULL) {
+        DISPLAY("Error: could not allocate space for samples\n");
+        return 1;
+    }
+
+    /* generate samples */
+    {
+        unsigned literalValue = 1;
+        unsigned samplesPos = 0;
+        size_t currSize = 1;
+        while (literalValue <= 4) {
+            sampleSizes[literalValue - 1] = currSize;
+            {
+                size_t k;
+                for (k = 0; k < currSize; k++) {
+                    *(samples + (samplesPos++)) = (BYTE)literalValue;
+                }
+            }
+            literalValue++;
+            currSize *= 16;
+        }
+    }
+
+
+    {
+        /* create variables */
+        size_t dictWriteSize = 0;
+        ZDICT_params_t zdictParams;
+        size_t const headerSize = MAX(dictSize/4, 256);
+        size_t const dictContentSize = dictSize - headerSize;
+        BYTE* const dictContent = fullDict + headerSize;
+        if (dictContentSize < ZDICT_CONTENTSIZE_MIN || dictSize < ZDICT_DICTSIZE_MIN) {
+            DISPLAY("Error: dictionary size is too small\n");
+            ret = 1;
+            goto exitGenRandomDict;
+        }
+
+        /* init dictionary params */
+        memset(&zdictParams, 0, sizeof(zdictParams));
+        zdictParams.dictID = dictID;
+        zdictParams.notificationLevel = 1;
+
+        /* fill in dictionary content */
+        RAND_buffer(&seed, (void*)dictContent, dictContentSize);
+
+        /* finalize dictionary with random samples */
+        dictWriteSize = ZDICT_finalizeDictionary(fullDict, dictSize,
+                                    dictContent, dictContentSize,
+                                    samples, sampleSizes, numSamples,
+                                    zdictParams);
+
+        if (ZDICT_isError(dictWriteSize)) {
+            DISPLAY("Could not finalize dictionary: %s\n", ZDICT_getErrorName(dictWriteSize));
+            ret = 1;
+        }
+    }
+
+exitGenRandomDict:
+    free(samples);
+    return ret;
+}
+
+static dictInfo initDictInfo(int useDict, size_t dictContentSize, BYTE* dictContent, U32 dictID){
+    /* allocate space statically */
+    dictInfo dictOp;
+    memset(&dictOp, 0, sizeof(dictOp));
+    dictOp.useDict = useDict;
+    dictOp.dictContentSize = dictContentSize;
+    dictOp.dictContent = dictContent;
+    dictOp.dictID = dictID;
+    return dictOp;
+}
+
 /*-*******************************************************
 *  Test Mode
 *********************************************************/
@@ -1194,6 +1322,65 @@
     return ret;
 }
 
+static size_t testDecodeWithDict(U32 seed)
+{
+    /* create variables */
+    size_t const dictSize = RAND(&seed) % (10 << 20) + ZDICT_DICTSIZE_MIN + ZDICT_CONTENTSIZE_MIN;
+    U32 const dictID = RAND(&seed);
+    size_t errorDetected = 0;
+    BYTE* const fullDict = malloc(dictSize);
+    if (fullDict == NULL) {
+        return ERROR(GENERIC);
+    }
+
+    /* generate random dictionary */
+    {
+        int const ret = genRandomDict(dictID, seed, dictSize, fullDict);
+        if (ret != 0) {
+            errorDetected = ERROR(GENERIC);
+            goto dictTestCleanup;
+        }
+    }
+
+
+    {
+        frame_t fr;
+
+        /* generate frame */
+        {
+            size_t const headerSize = MAX(dictSize/4, 256);
+            size_t const dictContentSize = dictSize-headerSize;
+            BYTE* const dictContent = fullDict+headerSize;
+            dictInfo const info = initDictInfo(1, dictContentSize, dictContent, dictID);
+            seed = generateFrame(seed, &fr, info);
+        }
+
+        /* manually decompress and check difference */
+        {
+            ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+            {
+                size_t const returnValue = ZSTD_decompress_usingDict(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
+                                                       fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart,
+                                                       fullDict, dictSize);
+                if (ZSTD_isError(returnValue)) {
+                    errorDetected = returnValue;
+                    goto dictTestCleanup;
+                }
+            }
+
+            if (memcmp(DECOMPRESSED_BUFFER, fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart) != 0) {
+                errorDetected = ERROR(corruption_detected);
+                goto dictTestCleanup;
+            }
+            ZSTD_freeDCtx(dctx);
+        }
+    }
+
+dictTestCleanup:
+    free(fullDict);
+    return errorDetected;
+}
+
 static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS)
 {
     unsigned fnum;
@@ -1207,28 +1394,39 @@
 
     for (fnum = 0; fnum < numFiles || clockSpan(startClock) < maxClockSpan; fnum++) {
         frame_t fr;
-
+        U32 const seedCopy = seed;
         if (fnum < numFiles)
             DISPLAYUPDATE("\r%u/%u        ", fnum, numFiles);
         else
             DISPLAYUPDATE("\r%u           ", fnum);
 
-        seed = generateFrame(seed, &fr);
+        {
+            dictInfo const info = initDictInfo(0, 0, NULL, 0);
+            seed = generateFrame(seed, &fr, info);
+        }
 
         {   size_t const r = testDecodeSimple(&fr);
             if (ZSTD_isError(r)) {
-                DISPLAY("Error in simple mode on test seed %u: %s\n", seed + fnum,
+                DISPLAY("Error in simple mode on test seed %u: %s\n", seedCopy,
                         ZSTD_getErrorName(r));
                 return 1;
             }
         }
         {   size_t const r = testDecodeStreaming(&fr);
             if (ZSTD_isError(r)) {
-                DISPLAY("Error in streaming mode on test seed %u: %s\n", seed + fnum,
+                DISPLAY("Error in streaming mode on test seed %u: %s\n", seedCopy,
                         ZSTD_getErrorName(r));
                 return 1;
             }
         }
+        {
+            /* don't create a dictionary that is too big */
+            size_t const r = testDecodeWithDict(seed);
+            if (ZSTD_isError(r)) {
+                DISPLAY("Error in dictionary mode on test seed %u: %s\n", seedCopy, ZSTD_getErrorName(r));
+                return 1;
+            }
+        }
     }
 
     DISPLAY("\r%u tests completed: ", fnum);
@@ -1248,7 +1446,10 @@
 
     DISPLAY("seed: %u\n", seed);
 
-    generateFrame(seed, &fr);
+    {
+        dictInfo const info = initDictInfo(0, 0, NULL, 0);
+        generateFrame(seed, &fr, info);
+    }
 
     outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path);
     if (origPath) {
@@ -1270,7 +1471,10 @@
 
         DISPLAYUPDATE("\r%u/%u        ", fnum, numFiles);
 
-        seed = generateFrame(seed, &fr);
+        {
+            dictInfo const info = initDictInfo(0, 0, NULL, 0);
+            seed = generateFrame(seed, &fr, info);
+        }
 
         if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) {
             DISPLAY("Error: path too long\n");
@@ -1292,6 +1496,93 @@
     return 0;
 }
 
+static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const path,
+                                    const char* const origPath, const size_t dictSize)
+{
+    char outPath[MAX_PATH];
+    BYTE* fullDict;
+    U32 const dictID = RAND(&seed);
+    int errorDetected = 0;
+
+    if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) {
+        DISPLAY("Error: path too long\n");
+        return 1;
+    }
+
+    /* allocate space for the dictionary */
+    fullDict = malloc(dictSize);
+    if (fullDict == NULL) {
+        DISPLAY("Error: could not allocate space for full dictionary.\n");
+        return 1;
+    }
+
+    /* randomly generate the dictionary */
+    {
+        int const ret = genRandomDict(dictID, seed, dictSize, fullDict);
+        if (ret != 0) {
+            errorDetected = ret;
+            goto dictCleanup;
+        }
+    }
+
+    /* write out dictionary */
+    if (numFiles != 0) {
+        if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) {
+            DISPLAY("Error: dictionary path too long\n");
+            errorDetected = 1;
+            goto dictCleanup;
+        }
+        outputBuffer(fullDict, dictSize, outPath);
+    }
+    else {
+        outputBuffer(fullDict, dictSize, "dictionary");
+    }
+
+    /* generate random compressed/decompressed files */
+    {
+        unsigned fnum;
+        for (fnum = 0; fnum < MAX(numFiles, 1); fnum++) {
+            frame_t fr;
+            DISPLAYUPDATE("\r%u/%u        ", fnum, numFiles);
+            {
+                size_t const headerSize = MAX(dictSize/4, 256);
+                size_t const dictContentSize = dictSize-headerSize;
+                BYTE* const dictContent = fullDict+headerSize;
+                dictInfo const info = initDictInfo(1, dictContentSize, dictContent, dictID);
+                seed = generateFrame(seed, &fr, info);
+            }
+
+            if (numFiles != 0) {
+                if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) {
+                    DISPLAY("Error: path too long\n");
+                    errorDetected = 1;
+                    goto dictCleanup;
+                }
+                outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath);
+
+                if (origPath) {
+                    if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) {
+                        DISPLAY("Error: path too long\n");
+                        errorDetected = 1;
+                        goto dictCleanup;
+                    }
+                    outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath);
+                }
+            }
+            else {
+                outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path);
+                if (origPath) {
+                    outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath);
+                }
+            }
+        }
+    }
+
+dictCleanup:
+    free(fullDict);
+    return errorDetected;
+}
+
 
 /*_*******************************************************
 *  Command line
@@ -1337,6 +1628,40 @@
     DISPLAY( "\n");
     DISPLAY( "Advanced arguments :\n");
     DISPLAY( " --content-size    : always include the content size in the frame header\n");
+    DISPLAY( " --use-dict=#      : include a dictionary used to decompress the corpus\n");
+}
+
+/*! readU32FromChar() :
+    @return : unsigned integer value read from input in `char` format
+    allows and interprets K, KB, KiB, M, MB and MiB suffix.
+    Will also modify `*stringPtr`, advancing it to position where it stopped reading.
+    Note : function result can overflow if digit string > MAX_UINT */
+static unsigned readU32FromChar(const char** stringPtr)
+{
+    unsigned result = 0;
+    while ((**stringPtr >='0') && (**stringPtr <='9'))
+        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
+    if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        result <<= 10;
+        if (**stringPtr=='M') result <<= 10;
+        (*stringPtr)++ ;
+        if (**stringPtr=='i') (*stringPtr)++;
+        if (**stringPtr=='B') (*stringPtr)++;
+    }
+    return result;
+}
+
+/** longCommandWArg() :
+ *  check if *stringPtr is the same as longCommand.
+ *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
+ *  @return 0 and doesn't modify *stringPtr otherwise.
+ */
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
+{
+    size_t const comSize = strlen(longCommand);
+    int const result = !strncmp(*stringPtr, longCommand, comSize);
+    if (result) *stringPtr += comSize;
+    return result;
 }
 
 int main(int argc, char** argv)
@@ -1348,6 +1673,8 @@
     int testMode = 0;
     const char* path = NULL;
     const char* origPath = NULL;
+    int useDict = 0;
+    unsigned dictSize = (10 << 10); /* 10 kB default */
 
     int argNb;
 
@@ -1408,6 +1735,9 @@
                     argument++;
                     if (strcmp(argument, "content-size") == 0) {
                         opts.contentSize = 1;
+                    } else if (longCommandWArg(&argument, "use-dict=")) {
+                        dictSize = readU32FromChar(&argument);
+                        useDict = 1;
                     } else {
                         advancedUsage(argv[0]);
                         return 1;
@@ -1439,9 +1769,13 @@
         return 1;
     }
 
-    if (numFiles == 0) {
+    if (numFiles == 0 && useDict == 0) {
         return generateFile(seed, path, origPath);
-    } else {
+    } else if (useDict == 0){
         return generateCorpus(seed, numFiles, path, origPath);
+    } else {
+        /* should generate files with a dictionary */
+        return generateCorpusWithDict(seed, numFiles, path, origPath, dictSize);
     }
+
 }
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
index a3a56d9..b8f5147 100644
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -193,8 +193,7 @@
     /* Static CCtx tests */
 #define STATIC_CCTX_LEVEL 3
     DISPLAYLEVEL(4, "test%3i : create static CCtx for level %u :", testNb++, STATIC_CCTX_LEVEL);
-    {   ZSTD_compressionParameters const cParams = ZSTD_getCParams(STATIC_CCTX_LEVEL, 0, 0);
-        size_t const staticCCtxSize = ZSTD_estimateCStreamSize(cParams);
+    {   size_t const staticCCtxSize = ZSTD_estimateCStreamSize(STATIC_CCTX_LEVEL);
         void* const staticCCtxBuffer = malloc(staticCCtxSize);
         size_t const staticDCtxSize = ZSTD_estimateDCtxSize();
         void* const staticDCtxBuffer = malloc(staticDCtxSize);
@@ -502,7 +501,7 @@
 
         DISPLAYLEVEL(4, "test%3i : estimate CDict size : ", testNb++);
         {   ZSTD_compressionParameters const cParams = ZSTD_getCParams(1, CNBuffSize, dictSize);
-            size_t const estimatedSize = ZSTD_estimateCDictSize(cParams, dictSize, 1 /*byReference*/);
+            size_t const estimatedSize = ZSTD_estimateCDictSize_advanced(dictSize, cParams, 1 /*byReference*/);
             DISPLAYLEVEL(4, "OK : %u \n", (U32)estimatedSize);
         }
 
@@ -535,7 +534,7 @@
 
         DISPLAYLEVEL(4, "test%3i : compress with static CDict : ", testNb++);
         {   ZSTD_compressionParameters const cParams = ZSTD_getCParams(1, CNBuffSize, dictSize);
-            size_t const cdictSize = ZSTD_estimateCDictSize(cParams, dictSize, 0);
+            size_t const cdictSize = ZSTD_estimateCDictSize_advanced(dictSize, cParams, 0);
             void* const cdictBuffer = malloc(cdictSize);
             if (cdictBuffer==NULL) goto _output_error;
             {   ZSTD_CDict* const cdict = ZSTD_initStaticCDict(cdictBuffer, cdictSize,
@@ -639,7 +638,7 @@
         size_t const sampleUnitSize = 8 KB;
         U32 const nbSamples = (U32)(totalSampleSize / sampleUnitSize);
         size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t));
-        COVER_params_t params;
+        ZDICT_cover_params_t params;
         U32 dictID;
 
         if (dictBuffer==NULL || samplesSizes==NULL) {
@@ -648,14 +647,14 @@
             goto _output_error;
         }
 
-        DISPLAYLEVEL(4, "test%3i : COVER_trainFromBuffer : ", testNb++);
+        DISPLAYLEVEL(4, "test%3i : ZDICT_trainFromBuffer_cover : ", testNb++);
         { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
         memset(&params, 0, sizeof(params));
         params.d = 1 + (FUZ_rand(&seed) % 16);
         params.k = params.d + (FUZ_rand(&seed) % 256);
-        dictSize = COVER_trainFromBuffer(dictBuffer, dictSize,
-                                         CNBuffer, samplesSizes, nbSamples,
-                                         params);
+        dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, dictSize,
+                                               CNBuffer, samplesSizes, nbSamples,
+                                               params);
         if (ZDICT_isError(dictSize)) goto _output_error;
         DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)dictSize);
 
@@ -664,12 +663,12 @@
         if (dictID==0) goto _output_error;
         DISPLAYLEVEL(4, "OK : %u \n", dictID);
 
-        DISPLAYLEVEL(4, "test%3i : COVER_optimizeTrainFromBuffer : ", testNb++);
+        DISPLAYLEVEL(4, "test%3i : ZDICT_optimizeTrainFromBuffer_cover : ", testNb++);
         memset(&params, 0, sizeof(params));
         params.steps = 4;
-        optDictSize = COVER_optimizeTrainFromBuffer(dictBuffer, optDictSize,
-                                                    CNBuffer, samplesSizes, nbSamples / 4,
-                                                    &params);
+        optDictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, optDictSize,
+                                                          CNBuffer, samplesSizes,
+                                                          nbSamples / 4, &params);
         if (ZDICT_isError(optDictSize)) goto _output_error;
         DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)optDictSize);
 
diff --git a/tests/paramgrill.c b/tests/paramgrill.c
index 1185c66..da06ccb 100644
--- a/tests/paramgrill.c
+++ b/tests/paramgrill.c
@@ -390,8 +390,8 @@
             double W_DMemUsed_note = W_ratioNote * ( 40 + 9*cLevel) - log((double)W_DMemUsed);
             double O_DMemUsed_note = O_ratioNote * ( 40 + 9*cLevel) - log((double)O_DMemUsed);
 
-            size_t W_CMemUsed = (1 << params.windowLog) + ZSTD_estimateCCtxSize(params);
-            size_t O_CMemUsed = (1 << winners[cLevel].params.windowLog) + ZSTD_estimateCCtxSize(winners[cLevel].params);
+            size_t W_CMemUsed = (1 << params.windowLog) + ZSTD_estimateCCtxSize_advanced(params);
+            size_t O_CMemUsed = (1 << winners[cLevel].params.windowLog) + ZSTD_estimateCCtxSize_advanced(winners[cLevel].params);
             double W_CMemUsed_note = W_ratioNote * ( 50 + 13*cLevel) - log((double)W_CMemUsed);
             double O_CMemUsed_note = O_ratioNote * ( 50 + 13*cLevel) - log((double)O_CMemUsed);
 
diff --git a/tests/playTests.sh b/tests/playTests.sh
index fa82ae9..2e1cc68 100755
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@@ -442,6 +442,7 @@
     XZEXE=1
     xz -V && lzma -V || XZEXE=0
     if [ $XZEXE -eq 1 ]; then
+        $ECHO "Testing zstd xz and lzma support"
         ./datagen > tmp
         $ZSTD --format=lzma -f tmp
         $ZSTD --format=xz -f tmp
@@ -452,6 +453,24 @@
         $ZSTD -d -f -v tmp.xz
         $ZSTD -d -f -v tmp.lzma
         rm tmp*
+        $ECHO "Creating symlinks"
+        ln -s $ZSTD ./xz
+        ln -s $ZSTD ./unxz
+        ln -s $ZSTD ./lzma
+        ln -s $ZSTD ./unlzma
+        $ECHO "Testing xz and lzma symlinks"
+        ./datagen > tmp
+        ./xz tmp
+        xz -d tmp.xz
+        ./lzma tmp
+        lzma -d tmp.lzma
+        $ECHO "Testing unxz and unlzma symlinks"
+        xz tmp
+        ./xz -d tmp.xz
+        lzma tmp
+        ./lzma -d tmp.lzma
+        rm xz unxz lzma unlzma
+        rm tmp*
     else
         $ECHO "xz binary not detected"
     fi
diff --git a/tests/symbols.c b/tests/symbols.c
index 5139a65..8920187 100644
--- a/tests/symbols.c
+++ b/tests/symbols.c
@@ -131,7 +131,10 @@
   &ZDICT_isError,
   &ZDICT_getErrorName,
 /* zdict.h: advanced functions */
-  &ZDICT_trainFromBuffer_advanced,
+  &ZDICT_trainFromBuffer_cover,
+  &ZDICT_optimizeTrainFromBuffer_cover,
+  &ZDICT_finalizeDictionary,
+  &ZDICT_trainFromBuffer_legacy,
   &ZDICT_addEntropyTablesFromBuffer,
   NULL,
 };
diff --git a/tests/zstreamtest.c b/tests/zstreamtest.c
index 0e14fd2..f16cc4b 100644
--- a/tests/zstreamtest.c
+++ b/tests/zstreamtest.c
@@ -210,9 +210,9 @@
     /* context size functions */
     DISPLAYLEVEL(3, "test%3i : estimate CStream size : ", testNb++);
     {   ZSTD_compressionParameters const cParams = ZSTD_getCParams(1, CNBufferSize, dictSize);
-        size_t const s = ZSTD_estimateCStreamSize(cParams)
+        size_t const s = ZSTD_estimateCStreamSize_advanced(cParams)
                         /* uses ZSTD_initCStream_usingDict() */
-                       + ZSTD_estimateCDictSize(cParams, dictSize, 0);
+                       + ZSTD_estimateCDictSize_advanced(dictSize, cParams, 0);
             if (ZSTD_isError(s)) goto _output_error;
             DISPLAYLEVEL(3, "OK (%u bytes) \n", (U32)s);
     }
@@ -285,8 +285,8 @@
         const void* cStart = (char*)compressedBuffer + (skippableFrameSize + 8);
         size_t const gfhError = ZSTD_getFrameHeader(&fhi, cStart, cSize);
         if (gfhError!=0) goto _output_error;
-        DISPLAYLEVEL(5, " (windowSize : %u) ", fhi.windowSize);
-        {   size_t const s = ZSTD_estimateDStreamSize(fhi)
+        DISPLAYLEVEL(5, " (windowSize : %u) ", (U32)fhi.windowSize);
+        {   size_t const s = ZSTD_estimateDStreamSize(fhi.windowSize)
                             /* uses ZSTD_initDStream_usingDict() */
                            + ZSTD_estimateDDictSize(dictSize, 0);
             if (ZSTD_isError(s)) goto _output_error;
@@ -415,7 +415,7 @@
 
     /* CDict scenario */
     DISPLAYLEVEL(3, "test%3i : digested dictionary : ", testNb++);
-    {   ZSTD_CDict* const cdict = ZSTD_createCDict(dictionary.start, dictionary.filled, 1);
+    {   ZSTD_CDict* const cdict = ZSTD_createCDict(dictionary.start, dictionary.filled, 1 /*byRef*/ );
         size_t const initError = ZSTD_initCStream_usingCDict(zc, cdict);
         if (ZSTD_isError(initError)) goto _output_error;
         cSize = 0;
@@ -522,6 +522,55 @@
         DISPLAYLEVEL(3, "OK (%s)\n", ZSTD_getErrorName(r));
     }
 
+    DISPLAYLEVEL(3, "test%3i : compress with ZSTD_CCtx_refPrefix : ", testNb++);
+    { size_t const refErr = ZSTD_CCtx_refPrefix(zc, dictionary.start, dictionary.filled);
+      if (ZSTD_isError(refErr)) goto _output_error; }
+    outBuff.dst = compressedBuffer;
+    outBuff.size = compressedBufferSize;
+    outBuff.pos = 0;
+    inBuff.src = CNBuffer;
+    inBuff.size = CNBufferSize;
+    inBuff.pos = 0;
+    { size_t const r = ZSTD_compress_generic(zc, &outBuff, &inBuff, ZSTD_e_end);
+      if (ZSTD_isError(r)) goto _output_error; }
+    if (inBuff.pos != inBuff.size) goto _output_error;  /* entire input should be consumed */
+    cSize = outBuff.pos;
+    DISPLAYLEVEL(3, "OK (%u bytes : %.2f%%)\n", (U32)cSize, (double)cSize/CNBufferSize*100);
+
+    DISPLAYLEVEL(3, "test%3i : decompress with dictionary : ", testNb++);
+    {   size_t const r = ZSTD_decompress_usingDict(zd,
+                                        decodedBuffer, CNBufferSize,
+                                        compressedBuffer, cSize,
+                                        dictionary.start, dictionary.filled);
+        if (ZSTD_isError(r)) goto _output_error;  /* must succeed : dictionary is provided */
+        DISPLAYLEVEL(3, "OK \n");
+    }
+
+    DISPLAYLEVEL(3, "test%3i : decompress without dictionary (should fail): ", testNb++);
+    {   size_t const r = ZSTD_decompress(decodedBuffer, CNBufferSize, compressedBuffer, cSize);
+        if (!ZSTD_isError(r)) goto _output_error;  /* must fail : dictionary not used */
+        DISPLAYLEVEL(3, "OK (%s)\n", ZSTD_getErrorName(r));
+    }
+
+    DISPLAYLEVEL(3, "test%3i : compress again with ZSTD_compress_generic : ", testNb++);
+    outBuff.dst = compressedBuffer;
+    outBuff.size = compressedBufferSize;
+    outBuff.pos = 0;
+    inBuff.src = CNBuffer;
+    inBuff.size = CNBufferSize;
+    inBuff.pos = 0;
+    { size_t const r = ZSTD_compress_generic(zc, &outBuff, &inBuff, ZSTD_e_end);
+      if (ZSTD_isError(r)) goto _output_error; }
+    if (inBuff.pos != inBuff.size) goto _output_error;  /* entire input should be consumed */
+    cSize = outBuff.pos;
+    DISPLAYLEVEL(3, "OK (%u bytes : %.2f%%)\n", (U32)cSize, (double)cSize/CNBufferSize*100);
+
+    DISPLAYLEVEL(3, "test%3i : decompress without dictionary (should work): ", testNb++);
+    {   size_t const r = ZSTD_decompress(decodedBuffer, CNBufferSize, compressedBuffer, cSize);
+        if (ZSTD_isError(r)) goto _output_error;  /* must succeed : frame was compressed without dictionary */
+        DISPLAYLEVEL(3, "OK \n");
+    }
+
     /* Empty srcSize */
     DISPLAYLEVEL(3, "test%3i : ZSTD_initCStream_advanced with pledgedSrcSize=0 and dict : ", testNb++);
     {   ZSTD_parameters params = ZSTD_getParams(5, 0, 0);
@@ -1266,11 +1315,11 @@
         }
 
         /* compression init */
+        CHECK_Z( ZSTD_CCtx_loadDictionary(zc, NULL, 0) );   /* cancel previous dict */
         if ((FUZ_rand(&lseed)&1) /* at beginning, to keep same nb of rand */
             && oldTestLog /* at least one test happened */ && resetAllowed) {
             maxTestSize = FUZ_randomLength(&lseed, oldTestLog+2);
             if (maxTestSize >= srcBufferSize) maxTestSize = srcBufferSize-1;
-            CHECK_Z( ZSTD_CCtx_loadDictionary(zc, NULL, 0) );
             {   int const compressionLevel = (FUZ_rand(&lseed) % 5) + 1;
                 CHECK_Z( ZSTD_CCtx_setParameter(zc, ZSTD_p_compressionLevel, compressionLevel) );
             }
@@ -1294,7 +1343,6 @@
                 ZSTD_compressionParameters cParams = ZSTD_getCParams(cLevel, pledgedSrcSize, dictSize);
 
                 /* mess with compression parameters */
-                CHECK_Z( ZSTD_CCtx_loadDictionary(zc, NULL, 0) );   /* always cancel previous dict, to make user it's possible to pass compression parameters */
                 cParams.windowLog += (FUZ_rand(&lseed) & 3) - 1;
                 cParams.hashLog += (FUZ_rand(&lseed) & 3) - 1;
                 cParams.chainLog += (FUZ_rand(&lseed) & 3) - 1;
@@ -1311,12 +1359,15 @@
                 if (FUZ_rand(&lseed) & 1) CHECK_Z( ZSTD_CCtx_setParameter(zc, ZSTD_p_targetLength, cParams.targetLength) );
 
                 /* unconditionally set, to be sync with decoder */
-                CHECK_Z( ZSTD_CCtx_loadDictionary(zc, dict, dictSize) );
-
-                if (dict && dictSize) {
-                    /* test that compression parameters are correctly rejected after setting a dictionary */
-                    size_t const setError = ZSTD_CCtx_setParameter(zc, ZSTD_p_windowLog, cParams.windowLog-1) ;
-                    CHECK(!ZSTD_isError(setError), "ZSTD_CCtx_setParameter should have failed");
+                if (FUZ_rand(&lseed) & 1) CHECK_Z( ZSTD_CCtx_setParameter(zc, ZSTD_p_refDictContent, FUZ_rand(&lseed) & 1) );
+                if (FUZ_rand(&lseed) & 1) {
+                    CHECK_Z( ZSTD_CCtx_loadDictionary(zc, dict, dictSize) );
+                    if (dict && dictSize) {
+                        /* test that compression parameters are rejected (correctly) after loading a non-NULL dictionary */
+                        size_t const setError = ZSTD_CCtx_setParameter(zc, ZSTD_p_windowLog, cParams.windowLog-1) ;
+                        CHECK(!ZSTD_isError(setError), "ZSTD_CCtx_setParameter should have failed");
+                }   } else {
+                    CHECK_Z( ZSTD_CCtx_refPrefix(zc, dict, dictSize) );
                 }
 
                 /* mess with frame parameters */
@@ -1380,8 +1431,10 @@
 
         /* multi - fragments decompression test */
         if (!dictSize /* don't reset if dictionary : could be different */ && (FUZ_rand(&lseed) & 1)) {
+            DISPLAYLEVEL(5, "resetting DCtx (dict:%08X) \n", (U32)(size_t)dict);
             CHECK_Z( ZSTD_resetDStream(zd) );
         } else {
+            DISPLAYLEVEL(5, "using dict of size %u \n", (U32)dictSize);
             CHECK_Z( ZSTD_initDStream_usingDict(zd, dict, dictSize) );
         }
         {   size_t decompressionResult = 1;
@@ -1393,7 +1446,8 @@
                 size_t const dstBuffSize = MIN(dstBufferSize - totalGenSize, randomDstSize);
                 inBuff.size = inBuff.pos + readCSrcSize;
                 outBuff.size = inBuff.pos + dstBuffSize;
-                DISPLAYLEVEL(5, "ZSTD_decompressStream input %u bytes \n", (U32)readCSrcSize);
+                DISPLAYLEVEL(5, "ZSTD_decompressStream input %u bytes (pos:%u/%u)\n",
+                            (U32)readCSrcSize, (U32)inBuff.pos, (U32)cSize);
                 decompressionResult = ZSTD_decompressStream(zd, &outBuff, &inBuff);
                 CHECK (ZSTD_isError(decompressionResult), "decompression error : %s", ZSTD_getErrorName(decompressionResult));
                 DISPLAYLEVEL(5, "inBuff.pos = %u \n", (U32)readCSrcSize);