Updated FSE lib
diff --git a/lib/fse.c b/lib/fse.c
index a81e2f3..b0318f1 100644
--- a/lib/fse.c
+++ b/lib/fse.c
@@ -241,14 +241,11 @@
     int  deltaFindState;
     U16  maxState;
     BYTE minBitsOut;
-    /* one byte padding */
+    /* one byte padding ; total 8 bytes */
 } FSE_symbolCompressionTransform;
 
-typedef struct
-{
-    U32 fakeTable[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];   /* compatible with FSE_compressU16() */
-} CTable_max_t;
-
+typedef U32 CTable_max_t[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
+typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
 
 /****************************************************************
 *  Internal functions
@@ -299,7 +296,7 @@
 /****************************************************************
 *  Header bitstream management
 ****************************************************************/
-size_t FSE_headerBound(unsigned maxSymbolValue, unsigned tableLog)
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
 {
     size_t maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 1;
     return maxSymbolValue ? maxHeaderSize : FSE_MAX_HEADERSIZE;
@@ -307,7 +304,7 @@
 
 #ifndef __clang_analyzer__   /* clang static analyzer has difficulties with this function : seems to believe normalizedCounter is uninitialized */
 
-static size_t FSE_writeHeader_generic (void* header, size_t headerBufferSize,
+static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
                                        const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
                                        unsigned safeWrite)
 {
@@ -405,22 +402,23 @@
 #endif // __clang_analyzer__
 
 
-size_t FSE_writeHeader (void* header, size_t headerBufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+size_t FSE_writeNCount (void* header, size_t headerBufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
 {
     if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported */
     if (tableLog < FSE_MIN_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported */
 
-    if (headerBufferSize < FSE_headerBound(maxSymbolValue, tableLog))
-        return FSE_writeHeader_generic(header, headerBufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
+    if (headerBufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
+        return FSE_writeNCount_generic(header, headerBufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
 
-    return FSE_writeHeader_generic(header, headerBufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
+    return FSE_writeNCount_generic(header, headerBufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
 }
 
 
-size_t FSE_readHeader (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
                  const void* headerBuffer, size_t hbSize)
 {
     const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
     const BYTE* ip = istart;
     int nbBits;
     int remaining;
@@ -430,6 +428,7 @@
     unsigned charnum = 0;
     int previous0 = 0;
 
+    if (hbSize < 4) return (size_t)-FSE_ERROR_srcSize_wrong;
     bitStream = FSE_readLE32(ip);
     nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
     if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return (size_t)-FSE_ERROR_tableLog_tooLarge;
@@ -459,7 +458,7 @@
             }
             n0 += bitStream & 3;
             bitCount += 2;
-            if (n0 > *maxSVPtr) return (size_t)-FSE_ERROR_GENERIC;
+            if (n0 > *maxSVPtr) return (size_t)-FSE_ERROR_maxSymbolValue_tooSmall;
             while (charnum < n0) normalizedCounter[charnum++] = 0;
             ip += bitCount>>3;
             bitCount &= 7;
@@ -491,16 +490,27 @@
                 threshold >>= 1;
             }
 
-            ip += bitCount>>3;
-            bitCount &= 7;
-            bitStream = FSE_readLE32(ip) >> bitCount;
+            {
+                const BYTE* itarget = ip + (bitCount>>3);
+                if (itarget > iend - 4)
+                {
+                    ip = iend - 4;
+                    bitCount -= (int)(8 * (iend - 4 - ip));
+                }
+                else
+                {
+                    ip = itarget;
+                    bitCount &= 7;
+                }
+                bitStream = FSE_readLE32(ip) >> (bitCount & 31);
+            }
         }
     }
     if (remaining != 1) return (size_t)-FSE_ERROR_GENERIC;
     *maxSVPtr = charnum-1;
 
-    ip += bitCount>0;
-    if ((size_t)(ip-istart) >= hbSize) return (size_t)-FSE_ERROR_srcSize_wrong;   /* arguably a bit late , tbd */
+    ip += (bitCount+7)>>3;
+    if ((size_t)(ip-istart) > hbSize) return (size_t)-FSE_ERROR_srcSize_wrong;
     return ip-istart;
 }
 
@@ -509,7 +519,7 @@
 *  FSE Compression Code
 ****************************************************************/
 /*
-CTable is a variable size structure which contains :
+FSE_CTable[0] is a variable size structure which contains :
     U16 tableLog;
     U16 maxSymbolValue;
     U16 nextStateNumber[1 << tableLog];                         // This size is variable
@@ -526,17 +536,17 @@
     return size;
 }
 
-void* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
+FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
 {
     size_t size;
     if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
     size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
-    return malloc(size);
+    return (FSE_CTable*)malloc(size);
 }
 
-void  FSE_freeCTable (void* CTable)
+void  FSE_freeCTable (FSE_CTable* ct)
 {
-    free(CTable);
+    free(ct);
 }
 
 
@@ -674,7 +684,7 @@
         U32 maxV = 0, maxC =0;
         for (s=0; s<=maxSymbolValue; s++)
             if (count[s] > maxC) maxV=s, maxC=count[s];
-        norm[maxV] += ToDistribute;
+        norm[maxV] += (short)ToDistribute;
         return 0;
     }
 
@@ -782,19 +792,18 @@
 }
 
 
-/* fake CTable, for raw (uncompressed) input */
-size_t FSE_buildCTable_raw (void* CTable, unsigned nbBits)
+/* fake FSE_CTable, for raw (uncompressed) input */
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
 {
     const unsigned tableSize = 1 << nbBits;
     const unsigned tableMask = tableSize - 1;
     const unsigned maxSymbolValue = tableMask;
-    U16* tableU16 = ( (U16*) CTable) + 2;
-    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) ((((U32*)CTable)+1) + (tableSize>>1));
+    U16* tableU16 = ( (U16*) ct) + 2;
+    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) ((((U32*)ct)+1) + (tableSize>>1));
     unsigned s;
 
     /* Sanity checks */
     if (nbBits < 1) return (size_t)-FSE_ERROR_GENERIC;             /* min size */
-    if (((size_t)CTable) & 3) return (size_t)-FSE_ERROR_GENERIC;   /* Must be allocated of 4 bytes boundaries */
 
     /* header */
     tableU16[-2] = (U16) nbBits;
@@ -816,15 +825,12 @@
 }
 
 
-/* fake CTable, for rle (100% always same symbol) input */
-size_t FSE_buildCTable_rle (void* CTable, BYTE symbolValue)
+/* fake FSE_CTable, for rle (100% always same symbol) input */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
 {
     const unsigned tableSize = 1;
-    U16* tableU16 = ( (U16*) CTable) + 2;
-    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) ((U32*)CTable + 2);
-
-    /* safety checks */
-    if (((size_t)CTable) & 3) return (size_t)-FSE_ERROR_GENERIC;   /* Must be 4 bytes aligned */
+    U16* tableU16 = ( (U16*) ct) + 2;
+    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) ((U32*)ct + 2);
 
     /* header */
     tableU16[-2] = (U16) 0;
@@ -853,12 +859,12 @@
     bitC->ptr = bitC->startPtr;
 }
 
-void FSE_initCState(FSE_CState_t* statePtr, const void* CTable)
+void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
 {
-    const U32 tableLog = ( (const U16*) CTable) [0];
+    const U32 tableLog = ( (const U16*) ct) [0];
     statePtr->value = (ptrdiff_t)1<<tableLog;
-    statePtr->stateTable = ((const U16*) CTable) + 2;
-    statePtr->symbolTT = (const U32*)CTable + 1 + (tableLog ? (1<<(tableLog-1)) : 1);
+    statePtr->stateTable = ((const U16*) ct) + 2;
+    statePtr->symbolTT = (const FSE_symbolCompressionTransform*)((const U32*)ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1));
     statePtr->stateLog = tableLog;
 }
 
@@ -869,14 +875,14 @@
     bitC->bitPos += nbBits;
 }
 
-void FSE_encodeByte(FSE_CStream_t* bitC, FSE_CState_t* statePtr, BYTE symbol)
+void FSE_encodeSymbol(FSE_CStream_t* bitC, FSE_CState_t* statePtr, BYTE symbol)
 {
-    const FSE_symbolCompressionTransform* const symbolTT = (const FSE_symbolCompressionTransform*) statePtr->symbolTT;
-    const U16* const stateTable = (const U16*) statePtr->stateTable;
-    int nbBitsOut  = symbolTT[symbol].minBitsOut;
-    nbBitsOut -= (int)((symbolTT[symbol].maxState - statePtr->value) >> 31);
+    const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+    const U16* const stateTable = (const U16*)(statePtr->stateTable);
+    int nbBitsOut  = symbolTT.minBitsOut;
+    nbBitsOut -= (int)((symbolTT.maxState - statePtr->value) >> 31);
     FSE_addBits(bitC, statePtr->value, nbBitsOut);
-    statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT[symbol].deltaFindState];
+    statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
 }
 
 void FSE_flushBits(FSE_CStream_t* bitC)
@@ -911,7 +917,7 @@
 
 size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
                            const void* src, size_t srcSize,
-                           const void* CTable)
+                           const FSE_CTable* ct)
 {
     const BYTE* const istart = (const BYTE*) src;
     const BYTE* ip;
@@ -924,7 +930,7 @@
     /* init */
     (void)dstSize;   /* objective : ensure it fits into dstBuffer (Todo) */
     FSE_initCStream(&bitC, dst);
-    FSE_initCState(&CState1, CTable);
+    FSE_initCState(&CState1, ct);
     CState2 = CState1;
 
     ip=iend;
@@ -932,32 +938,32 @@
     /* join to even */
     if (srcSize & 1)
     {
-        FSE_encodeByte(&bitC, &CState1, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
         FSE_flushBits(&bitC);
     }
 
     /* join to mod 4 */
     if ((sizeof(size_t)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2))   /* test bit 2 */
     {
-        FSE_encodeByte(&bitC, &CState2, *--ip);
-        FSE_encodeByte(&bitC, &CState1, *--ip);
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
         FSE_flushBits(&bitC);
     }
 
     /* 2 or 4 encoding per loop */
     while (ip>istart)
     {
-        FSE_encodeByte(&bitC, &CState2, *--ip);
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
 
         if (sizeof(size_t)*8 < FSE_MAX_TABLELOG*2+7 )   /* this test must be static */
             FSE_flushBits(&bitC);
 
-        FSE_encodeByte(&bitC, &CState1, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
 
         if (sizeof(size_t)*8 > FSE_MAX_TABLELOG*4+7 )   /* this test must be static */
         {
-            FSE_encodeByte(&bitC, &CState2, *--ip);
-            FSE_encodeByte(&bitC, &CState1, *--ip);
+            FSE_encodeSymbol(&bitC, &CState2, *--ip);
+            FSE_encodeSymbol(&bitC, &CState1, *--ip);
         }
 
         FSE_flushBits(&bitC);
@@ -983,7 +989,7 @@
 
     U32   count[FSE_MAX_SYMBOL_VALUE+1];
     S16   norm[FSE_MAX_SYMBOL_VALUE+1];
-    CTable_max_t CTable;
+    CTable_max_t ct;
     size_t errorCode;
 
     /* early out */
@@ -993,7 +999,7 @@
     if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
 
     /* Scan input and build symbol stats */
-    errorCode = FSE_count (count, ip, srcSize, &maxSymbolValue);
+    errorCode = FSE_count (count, &maxSymbolValue, ip, srcSize);
     if (FSE_isError(errorCode)) return errorCode;
     if (errorCode == srcSize) return 1;
     if (errorCode < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
@@ -1003,14 +1009,14 @@
     if (FSE_isError(errorCode)) return errorCode;
 
     /* Write table description header */
-    errorCode = FSE_writeHeader (op, FSE_MAX_HEADERSIZE, norm, maxSymbolValue, tableLog);
+    errorCode = FSE_writeNCount (op, FSE_MAX_HEADERSIZE, norm, maxSymbolValue, tableLog);
     if (FSE_isError(errorCode)) return errorCode;
     op += errorCode;
 
     /* Compress */
-    errorCode = FSE_buildCTable (&CTable, norm, maxSymbolValue, tableLog);
+    errorCode = FSE_buildCTable (ct, norm, maxSymbolValue, tableLog);
     if (FSE_isError(errorCode)) return errorCode;
-    op += FSE_compress_usingCTable(op, oend - op, ip, srcSize, &CTable);
+    op += FSE_compress_usingCTable(op, oend - op, ip, srcSize, ct);
 
     /* check compressibility */
     if ( (size_t)(op-ostart) >= srcSize-1 )
@@ -1036,24 +1042,12 @@
     BYTE nbBits;
 } FSE_decode_t;   /* size == U32 */
 
-/* Specific corner case : RLE compression */
-size_t FSE_decompressRLE(void* dst, size_t originalSize,
-                   const void* cSrc, size_t cSrcSize)
-{
-    if (cSrcSize != 1) return (size_t)-FSE_ERROR_srcSize_wrong;
-    memset(dst, *(const BYTE*)cSrc, originalSize);
-    return originalSize;
-}
 
-
-size_t FSE_buildDTable_rle (void* DTable, BYTE symbolValue)
+size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
 {
-    U32* const base32 = (U32*)DTable;
+    U32* const base32 = (U32*)dt;
     FSE_decode_t* const cell = (FSE_decode_t*)(base32 + 1);
 
-    /* Sanity check */
-    if (((size_t)DTable) & 3) return (size_t)-FSE_ERROR_GENERIC;   /* Must be allocated of 4 bytes boundaries */
-
     base32[0] = 0;
 
     cell->newState = 0;
@@ -1064,9 +1058,9 @@
 }
 
 
-size_t FSE_buildDTable_raw (void* DTable, unsigned nbBits)
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
 {
-    U32* const base32 = (U32*)DTable;
+    U32* const base32 = (U32*)dt;
     FSE_decode_t* dinfo = (FSE_decode_t*)(base32 + 1);
     const unsigned tableSize = 1 << nbBits;
     const unsigned tableMask = tableSize - 1;
@@ -1075,7 +1069,6 @@
 
     /* Sanity checks */
     if (nbBits < 1) return (size_t)-FSE_ERROR_GENERIC;             /* min size */
-    if (((size_t)DTable) & 3) return (size_t)-FSE_ERROR_GENERIC;   /* Must be allocated of 4 bytes boundaries */
 
     /* Build Decoding Table */
     base32[0] = nbBits;
@@ -1100,11 +1093,11 @@
 {
     if (srcSize < 1) return (size_t)-FSE_ERROR_srcSize_wrong;
 
-    if (srcSize >=  sizeof(bitD_t))
+    if (srcSize >=  sizeof(size_t))
     {
         U32 contain32;
         bitD->start = (const char*)srcBuffer;
-        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD_t);
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);
         bitD->bitContainer = FSE_readLEST(bitD->ptr);
         contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
         if (contain32 == 0) return (size_t)-FSE_ERROR_GENERIC;   /* stop bit not present */
@@ -1118,18 +1111,18 @@
         bitD->bitContainer = *(const BYTE*)(bitD->start);
         switch(srcSize)
         {
-            case 7: bitD->bitContainer += (bitD_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(bitD_t)*8 - 16);
-            case 6: bitD->bitContainer += (bitD_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(bitD_t)*8 - 24);
-            case 5: bitD->bitContainer += (bitD_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(bitD_t)*8 - 32);
-            case 4: bitD->bitContainer += (bitD_t)(((const BYTE*)(bitD->start))[3]) << 24;
-            case 3: bitD->bitContainer += (bitD_t)(((const BYTE*)(bitD->start))[2]) << 16;
-            case 2: bitD->bitContainer += (bitD_t)(((const BYTE*)(bitD->start))[1]) <<  8;
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24;
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16;
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8;
             default:;
         }
         contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
         if (contain32 == 0) return (size_t)-FSE_ERROR_GENERIC;   /* stop bit not present */
         bitD->bitsConsumed = 8 - FSE_highbit32(contain32);
-        bitD->bitsConsumed += (U32)(sizeof(bitD_t) - srcSize)*8;
+        bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;
     }
 
     return srcSize;
@@ -1143,23 +1136,23 @@
  * Use the fast variant *only* if n >= 1.
  * return : value extracted.
  */
-bitD_t FSE_readBits(FSE_DStream_t* bitD, U32 nbBits)
+size_t FSE_readBits(FSE_DStream_t* bitD, U32 nbBits)
 {
-    bitD_t value = ((bitD->bitContainer << (bitD->bitsConsumed & ((sizeof(bitD_t)*8)-1))) >> 1) >> (((sizeof(bitD_t)*8)-1)-nbBits);
+    size_t value = ((bitD->bitContainer << (bitD->bitsConsumed & ((sizeof(size_t)*8)-1))) >> 1) >> (((sizeof(size_t)*8)-1)-nbBits);
     bitD->bitsConsumed += nbBits;
     return value;
 }
 
-bitD_t FSE_readBitsFast(FSE_DStream_t* bitD, U32 nbBits)   /* only if nbBits >= 1 !! */
+size_t FSE_readBitsFast(FSE_DStream_t* bitD, U32 nbBits)   /* only if nbBits >= 1 !! */
 {
-    bitD_t value = (bitD->bitContainer << bitD->bitsConsumed) >> ((sizeof(bitD_t)*8)-nbBits);
+    size_t value = (bitD->bitContainer << bitD->bitsConsumed) >> ((sizeof(size_t)*8)-nbBits);
     bitD->bitsConsumed += nbBits;
     return value;
 }
 
 unsigned FSE_reloadDStream(FSE_DStream_t* bitD)
 {
-    if (bitD->ptr >= bitD->start + sizeof(bitD_t))
+    if (bitD->ptr >= bitD->start + sizeof(size_t))
     {
         bitD->ptr -= bitD->bitsConsumed >> 3;
         bitD->bitsConsumed &= 7;
@@ -1168,8 +1161,8 @@
     }
     if (bitD->ptr == bitD->start)
     {
-        if (bitD->bitsConsumed < sizeof(bitD_t)*8) return 1;
-        if (bitD->bitsConsumed == sizeof(bitD_t)*8) return 2;
+        if (bitD->bitsConsumed < sizeof(size_t)*8) return 1;
+        if (bitD->bitsConsumed == sizeof(size_t)*8) return 2;
         return 3;
     }
     {
@@ -1184,9 +1177,9 @@
 }
 
 
-void FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD, const void* DTable)
+void FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD, const FSE_DTable* dt)
 {
-    const U32* const base32 = (const U32*)DTable;
+    const U32* const base32 = (const U32*)dt;
     DStatePtr->state = FSE_readBits(bitD, base32[0]);
     FSE_reloadDStream(bitD);
     DStatePtr->table = base32 + 1;
@@ -1197,7 +1190,7 @@
     const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
     const U32  nbBits = DInfo.nbBits;
     BYTE symbol = DInfo.symbol;
-    bitD_t lowBits = FSE_readBits(bitD, nbBits);
+    size_t lowBits = FSE_readBits(bitD, nbBits);
 
     DStatePtr->state = DInfo.newState + lowBits;
     return symbol;
@@ -1208,7 +1201,7 @@
     const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
     const U32 nbBits = DInfo.nbBits;
     BYTE symbol = DInfo.symbol;
-    bitD_t lowBits = FSE_readBitsFast(bitD, nbBits);
+    size_t lowBits = FSE_readBitsFast(bitD, nbBits);
 
     DStatePtr->state = DInfo.newState + lowBits;
     return symbol;
@@ -1219,19 +1212,19 @@
 
 unsigned FSE_endOfDStream(const FSE_DStream_t* bitD)
 {
-    return ((bitD->ptr == bitD->start) && (bitD->bitsConsumed == sizeof(bitD_t)*8));
+    return ((bitD->ptr == bitD->start) && (bitD->bitsConsumed == sizeof(size_t)*8));
 }
 
-unsigned FSE_endOfDState(const FSE_DState_t* statePtr)
+unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
 {
-    return statePtr->state == 0;
+    return DStatePtr->state == 0;
 }
 
 
 FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
           void* dst, size_t maxDstSize,
     const void* cSrc, size_t cSrcSize,
-    const void* DTable, unsigned fast)
+    const FSE_DTable* dt, unsigned fast)
 {
     BYTE* const ostart = (BYTE*) dst;
     BYTE* op = ostart;
@@ -1239,15 +1232,16 @@
     BYTE* const olimit = omax-3;
 
     FSE_DStream_t bitD;
-    FSE_DState_t state1, state2;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
     size_t errorCode;
 
     /* Init */
     errorCode = FSE_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
     if (FSE_isError(errorCode)) return errorCode;
 
-    FSE_initDState(&state1, &bitD, DTable);
-    FSE_initDState(&state2, &bitD, DTable);
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
 
 
     /* 2 symbols per loop */
@@ -1255,12 +1249,12 @@
     {
         *op++ = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
 
-        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD_t)*8)    /* This test must be static */
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(size_t)*8)    /* This test must be static */
             FSE_reloadDStream(&bitD);
 
         *op++ = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
 
-        if (FSE_MAX_TABLELOG*4+7 < sizeof(bitD_t)*8)    /* This test must be static */
+        if (FSE_MAX_TABLELOG*4+7 < sizeof(size_t)*8)    /* This test must be static */
         {
             *op++ = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
             *op++ = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
@@ -1294,11 +1288,11 @@
 
 size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
                             const void* cSrc, size_t cSrcSize,
-                            const void* DTable, size_t fastMode)
+                            const FSE_DTable* dt, size_t fastMode)
 {
     /* select fast mode (static) */
-    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, DTable, 1);
-    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, DTable, 0);
+    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
 }
 
 
@@ -1307,25 +1301,25 @@
     const BYTE* const istart = (const BYTE*)cSrc;
     const BYTE* ip = istart;
     short counting[FSE_MAX_SYMBOL_VALUE+1];
-    FSE_decode_t DTable[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
-    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
     unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
     size_t errorCode, fastMode;
 
     if (cSrcSize<2) return (size_t)-FSE_ERROR_srcSize_wrong;   /* too small input size */
 
     /* normal FSE decoding mode */
-    errorCode = FSE_readHeader (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    errorCode = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
     if (FSE_isError(errorCode)) return errorCode;
     if (errorCode >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong;   /* too small input size */
     ip += errorCode;
     cSrcSize -= errorCode;
 
-    fastMode = FSE_buildDTable (DTable, counting, maxSymbolValue, tableLog);
+    fastMode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog);
     if (FSE_isError(fastMode)) return fastMode;
 
     /* always return, even if it is an error code */
-    return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, DTable, fastMode);
+    return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt, fastMode);
 }
 
 
@@ -1334,8 +1328,8 @@
 /*
   2nd part of the file
   designed to be included
-  for type-specific functions (template equivalent in C)
-  Objective is to write such functions only once, for better maintenance
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
 */
 
 /* safety checks */
@@ -1353,7 +1347,8 @@
 
 
 /* Function templates */
-size_t FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (unsigned* count, const FSE_FUNCTION_TYPE* source, size_t sourceSize, unsigned* maxSymbolValuePtr, unsigned safe)
+size_t FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION)
+(unsigned* count, unsigned* maxSymbolValuePtr, const FSE_FUNCTION_TYPE* source, size_t sourceSize, unsigned safe)
 {
     const FSE_FUNCTION_TYPE* ip = source;
     const FSE_FUNCTION_TYPE* const iend = ip+sourceSize;
@@ -1427,48 +1422,45 @@
 
     while (!count[maxSymbolValue]) maxSymbolValue--;
     *maxSymbolValuePtr = maxSymbolValue;
-    return (int)max;
+    return (size_t)max;
 }
 
 /* hidden fast variant (unsafe) */
-size_t FSE_FUNCTION_NAME(FSE_countFast, FSE_FUNCTION_EXTENSION) (unsigned* count, const FSE_FUNCTION_TYPE* source, size_t sourceSize, unsigned* maxSymbolValuePtr)
+size_t FSE_FUNCTION_NAME(FSE_countFast, FSE_FUNCTION_EXTENSION)
+(unsigned* count, unsigned* maxSymbolValuePtr, const FSE_FUNCTION_TYPE* source, size_t sourceSize)
 {
-    return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, source, sourceSize, maxSymbolValuePtr, 0);
+    return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, maxSymbolValuePtr, source, sourceSize, 0);
 }
 
-size_t FSE_FUNCTION_NAME(FSE_count, FSE_FUNCTION_EXTENSION) (unsigned* count, const FSE_FUNCTION_TYPE* source, size_t sourceSize, unsigned* maxSymbolValuePtr)
+size_t FSE_FUNCTION_NAME(FSE_count, FSE_FUNCTION_EXTENSION)
+(unsigned* count, unsigned* maxSymbolValuePtr, const FSE_FUNCTION_TYPE* source, size_t sourceSize)
 {
     if ((sizeof(FSE_FUNCTION_TYPE)==1) && (*maxSymbolValuePtr >= 255))
     {
         *maxSymbolValuePtr = 255;
-        return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, source, sourceSize, maxSymbolValuePtr, 0);
+        return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, maxSymbolValuePtr, source, sourceSize, 0);
     }
-    return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, source, sourceSize, maxSymbolValuePtr, 1);
+    return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, maxSymbolValuePtr, source, sourceSize, 1);
 }
 
 
-#ifndef __clang_analyzer__   /* clang static analyzer doesn't understand that tableSymbol is necessarily entirely initialized */
-
 static U32 FSE_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; }
 
 size_t FSE_FUNCTION_NAME(FSE_buildCTable, FSE_FUNCTION_EXTENSION)
-(void* CTable, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
 {
     const unsigned tableSize = 1 << tableLog;
     const unsigned tableMask = tableSize - 1;
-    U16* tableU16 = ( (U16*) CTable) + 2;
-    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) (((U32*)CTable) + 1 + (tableLog ? tableSize>>1 : 1) );
+    U16* tableU16 = ( (U16*) ct) + 2;
+    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) (((U32*)ct) + 1 + (tableLog ? tableSize>>1 : 1) );
     const unsigned step = FSE_tableStep(tableSize);
     unsigned cumul[FSE_MAX_SYMBOL_VALUE+2];
     U32 position = 0;
-    FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE];
+    FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE] = {0};   /* should not be necessary, but analyzer complain without it, and performance loss is negligible with it */
     U32 highThreshold = tableSize-1;
     unsigned symbol;
     unsigned i;
 
-    /* safety checks */
-    if (((size_t)CTable) & 3) return (size_t)-FSE_ERROR_GENERIC;   /* Must be allocated of 4 bytes boundaries */
-
     /* header */
     tableU16[-2] = (U16) tableLog;
     tableU16[-1] = (U16) maxSymbolValue;
@@ -1508,10 +1500,10 @@
     for (i=0; i<tableSize; i++)
     {
         FSE_FUNCTION_TYPE s = tableSymbol[i];
-        tableU16[cumul[s]++] = (U16) (tableSize+i);   // Table U16 : sorted by symbol order; gives next state value
+        tableU16[cumul[s]++] = (U16) (tableSize+i);   /* Table U16 : sorted by symbol order; gives next state value */
     }
 
-    // Build Symbol Transformation Table
+    /* Build Symbol Transformation Table */
     {
         unsigned s;
         unsigned total = 0;
@@ -1540,27 +1532,25 @@
     return 0;
 }
 
-#endif // __clang_analyzer__
-
 
 #define FSE_DECODE_TYPE FSE_TYPE_NAME(FSE_decode_t, FSE_FUNCTION_EXTENSION)
 
-void* FSE_FUNCTION_NAME(FSE_createDTable, FSE_FUNCTION_EXTENSION) (unsigned tableLog)
+FSE_DTable* FSE_FUNCTION_NAME(FSE_createDTable, FSE_FUNCTION_EXTENSION) (unsigned tableLog)
 {
     if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
-    return malloc( ((size_t)1<<tableLog) * sizeof (FSE_DECODE_TYPE) );
+    return (FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
 }
 
-void FSE_FUNCTION_NAME(FSE_freeDTable, FSE_FUNCTION_EXTENSION) (void* DTable)
+void FSE_FUNCTION_NAME(FSE_freeDTable, FSE_FUNCTION_EXTENSION) (FSE_DTable* dt)
 {
-    free(DTable);
+    free(dt);
 }
 
 
 size_t FSE_FUNCTION_NAME(FSE_buildDTable, FSE_FUNCTION_EXTENSION)
-(void* DTable, const short* const normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
 {
-    U32* const base32 = (U32*)DTable;
+    U32* const base32 = (U32*)dt;
     FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (base32+1);
     const U32 tableSize = 1 << tableLog;
     const U32 tableMask = tableSize-1;
@@ -1611,7 +1601,7 @@
         U32 i;
         for (i=0; i<tableSize; i++)
         {
-            FSE_FUNCTION_TYPE symbol = tableDecode[i].symbol;
+            FSE_FUNCTION_TYPE symbol = (FSE_FUNCTION_TYPE)(tableDecode[i].symbol);
             U16 nextState = symbolNext[symbol]++;
             tableDecode[i].nbBits = (BYTE) (tableLog - FSE_highbit32 ((U32)nextState) );
             tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
diff --git a/lib/fse.h b/lib/fse.h
index 1526f0f..df95d25 100644
--- a/lib/fse.h
+++ b/lib/fse.h
@@ -42,7 +42,7 @@
 /******************************************
 *  Includes
 ******************************************/
-#include <stddef.h>    // size_t, ptrdiff_t
+#include <stddef.h>    /* size_t, ptrdiff_t */
 
 
 /******************************************
@@ -50,7 +50,7 @@
 ******************************************/
 size_t FSE_compress(void* dst, size_t maxDstSize,
               const void* src, size_t srcSize);
-size_t FSE_decompress(void* dst, size_t maxDstSize,
+size_t FSE_decompress(void* dst,  size_t maxDstSize,
                 const void* cSrc, size_t cSrcSize);
 /*
 FSE_compress():
@@ -58,29 +58,19 @@
     'dst' buffer must be already allocated, and sized to handle worst case situations.
     Worst case size evaluation is provided by FSE_compressBound().
     return : size of compressed data
-    Special values : if result == 0, data is uncompressible => Nothing is stored within cSrc !!
-                     if result == 1, data is one constant element x srcSize times. Use RLE compression.
-                     if FSE_isError(result), it's an error code.
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!!
+                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
+                     if FSE_isError(return), it's an error code.
 
 FSE_decompress():
     Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
     into already allocated destination buffer 'dst', of size 'maxDstSize'.
-    ** Important ** : This function doesn't decompress uncompressed nor RLE data !
     return : size of regenerated data (<= maxDstSize)
              or an error code, which can be tested using FSE_isError()
-*/
 
-
-size_t FSE_decompressRLE(void* dst, size_t originalSize,
-                   const void* cSrc, size_t cSrcSize);
-/*
-FSE_decompressRLE():
-    Decompress specific RLE corner case (equivalent to memset()).
-    cSrcSize must be == 1. originalSize must be exact.
-    return : size of regenerated data (==originalSize)
-             or an error code, which can be tested using FSE_isError()
-
-Note : there is no function provided for uncompressed data, as it's just a simple memcpy()
+    ** Important ** : FSE_decompress() doesn't decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    FSE library doesn't manage headers, which are intentionally left to the user layer.
 */
 
 
@@ -102,51 +92,103 @@
     Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
     Both parameters can be defined as '0' to mean : use default value
     return : size of compressed data
-             or -1 if there is an error
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!!
+                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
+                     if FSE_isError(return), it's an error code.
 */
 size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
 
 
 /******************************************
-   FSE detailed API
+*  FSE detailed API
 ******************************************/
 /*
-int FSE_compress(char* dest, const char* source, int inputSize) does the following:
+FSE_compress() does the following:
 1. count symbol occurrence from source[] into table count[]
 2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
 3. save normalized counters to memory buffer using writeHeader()
 4. build encoding table 'CTable' from normalized counters
-5. encode the data stream using encoding table
+5. encode the data stream using encoding table 'CTable'
 
-int FSE_decompress(char* dest, int originalSize, const char* compressed) performs:
+FSE_decompress() does the following:
 1. read normalized counters with readHeader()
 2. build decoding table 'DTable' from normalized counters
-3. decode the data stream using decoding table
+3. decode the data stream using decoding table 'DTable'
 
-The following API allows triggering specific sub-functions.
+The following API allows to trigger specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using one's own method.
 */
 
 /* *** COMPRESSION *** */
 
-size_t FSE_count(unsigned* count, const unsigned char* src, size_t srcSize, unsigned* maxSymbolValuePtr);
-
-unsigned FSE_optimalTableLog(unsigned tableLog, size_t srcSize, unsigned maxSymbolValue);
-size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, const unsigned* count, size_t total, unsigned maxSymbolValue);
-
-size_t FSE_headerBound(unsigned maxSymbolValue, unsigned tableLog);
-size_t FSE_writeHeader (void* headerBuffer, size_t headerBufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
-
-void*  FSE_createCTable (unsigned tableLog, unsigned maxSymbolValue);
-void   FSE_freeCTable (void* CTable);
-size_t FSE_buildCTable(void* CTable, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
-
-size_t FSE_compress_usingCTable (void* dst, size_t dstSize, const void* src, size_t srcSize, const void* CTable);
+/*
+FSE_count():
+   Provides the precise count of each symbol within a table 'count'
+   'count' is a table of unsigned int, of minimum size (maxSymbolValuePtr[0]+1).
+   maxSymbolValuePtr[0] will be updated if detected smaller than initially expected
+   return : the count of the most frequent symbol (which is not identified)
+            if return == srcSize, there is only one symbol.
+            if FSE_isError(return), it's an error code. */
+size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr, const unsigned char* src, size_t srcSize);
 
 /*
+FSE_optimalTableLog():
+   dynamically downsize 'tableLog' when conditions are met.
+   It saves CPU time, by using smaller tables, while preserving or even improving compression ratio.
+   return : recommended tableLog (necessarily <= initial 'tableLog') */
+unsigned FSE_optimalTableLog(unsigned tableLog, size_t srcSize, unsigned maxSymbolValue);
+
+/*
+FSE_normalizeCount():
+   normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+   'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
+   return : tableLog,
+            or an errorCode, which can be tested using FSE_isError() */
+size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, const unsigned* count, size_t srcSize, unsigned maxSymbolValue);
+
+/*
+FSE_NCountWriteBound():
+   Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'
+   Typically useful for allocation purpose. */
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog);
+
+/*
+FSE_writeNCount():
+   Compactly save 'normalizedCounter' into 'buffer'.
+   return : size of the compressed table
+            or an errorCode, which can be tested using FSE_isError() */
+size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+
+/*
+Constructor and Destructor of type FSE_CTable
+Not that its size depends on parameters 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable;   /* don't allocate that. It's just a way to be more restrictive than void */
+FSE_CTable* FSE_createCTable (unsigned tableLog, unsigned maxSymbolValue);
+void        FSE_freeCTable (FSE_CTable* ct);
+
+/*
+FSE_buildCTable():
+   Builds 'ct', which must be already allocated, using FSE_createCTable()
+   return : 0
+            or an errorCode, which can be tested using FSE_isError() */
+size_t   FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*
+FSE_compress_usingCTable():
+   Compress 'src' using 'ct' into 'dst' which must be already allocated
+   return : size of compressed data
+            or an errorCode, which can be tested using FSE_isError() */
+size_t FSE_compress_usingCTable (void* dst, size_t dstSize, const void* src, size_t srcSize, const FSE_CTable* ct);
+
+/*
+Tutorial :
+----------
 The first step is to count all symbols. FSE_count() provides one quick way to do this job.
-Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have '*maxSymbolValuePtr+1' cells.
-'source' is a table of char of size 'sourceSize'. All values within 'src' MUST be <= *maxSymbolValuePtr
-*maxSymbolValuePtr will be updated, with its real value (necessarily <= original value)
+Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells.
+'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0]
+maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value)
 FSE_count() will return the number of occurrence of the most frequent symbol.
 If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
 
@@ -170,211 +212,83 @@
 The result of the function is the number of bytes written into 'header'.
 If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()) (for example, buffer size too small).
 
-'normalizedCounter' can then be used to create the compression tables 'CTable'.
+'normalizedCounter' can then be used to create the compression table 'CTable'.
 The space required by 'CTable' must be already allocated. Its size is provided by FSE_sizeof_CTable().
 'CTable' must be aligned of 4 bytes boundaries.
 You can then use FSE_buildCTable() to fill 'CTable'.
 In both cases, if there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
 
-'CTable' can then be used to compress 'source', with FSE_compress_usingCTable().
-Similar to FSE_count(), the convention is that 'source' is assumed to be a table of char of size 'sourceSize'
-The function returns the size of compressed data (without header), or -1 if failed.
+'CTable' can then be used to compress 'src', with FSE_compress_usingCTable().
+Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'
+The function returns the size of compressed data (without header).
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
 */
 
 
 /* *** DECOMPRESSION *** */
 
-size_t FSE_readHeader (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* headerBuffer, size_t hbSize);
-
-void*  FSE_createDTable(unsigned tableLog);
-void   FSE_freeDTable(void* DTable);
-size_t FSE_buildDTable (void* DTable, const short* const normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
-
-size_t FSE_decompress_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const void* DTable, size_t fastMode);
+/*
+FSE_readNCount():
+   Read compactly saved 'normalizedCounter' from 'rBuffer'.
+   return : size read from 'rBuffer'
+            or an errorCode, which can be tested using FSE_isError()
+            maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
 
 /*
-If the block is RLE compressed, or uncompressed, use the relevant specific functions.
+Constructor and Destructor of type FSE_DTable
+Note that its size depends on parameters 'tableLog' */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void */
+FSE_DTable* FSE_createDTable(unsigned tableLog);
+void        FSE_freeDTable(FSE_DTable* dt);
+
+/*
+FSE_buildDTable():
+   Builds 'dt', which must be already allocated, using FSE_createDTable()
+   return : 1 if 'dt' is compatible with fast mode, 0 otherwise,
+            or an errorCode, which can be tested using FSE_isError() */
+size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*
+FSE_decompress_usingDTable():
+   Decompress compressed source 'cSrc' of size 'cSrcSize'
+   using 'dt' into 'dst' which must be already allocated.
+   Use fastMode==1 only if authorized by result of FSE_buildDTable().
+   return : size of regenerated data (necessarily <= maxDstSize)
+            or an errorCode, which can be tested using FSE_isError() */
+size_t FSE_decompress_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt, size_t fastMode);
+
+/*
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
 
 The first step is to obtain the normalized frequencies of symbols.
 This can be performed by reading a header with FSE_readHeader().
-'normalizedCounter' must be already allocated, and have at least '*maxSymbolValuePtr+1' cells of short.
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of short.
 In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
 or size the table to handle worst case situations (typically 256).
 FSE_readHeader will provide 'tableLog' and 'maxSymbolValue' stored into the header.
 The result of FSE_readHeader() is the number of bytes read from 'header'.
-The following values have special meaning :
-return 2 : there is only a single symbol value. The value is provided into the second byte of header.
-return 1 : data is uncompressed
+Note that 'headerSize' must be at least 4 bytes, even if useful information is less than that.
 If there is an error, the function will return an error code, which can be tested using FSE_isError().
 
-The next step is to create the decompression tables 'DTable' from 'normalizedCounter'.
+The next step is to create the decompression tables 'FSE_DTable' from 'normalizedCounter'.
 This is performed by the function FSE_buildDTable().
-The space required by 'DTable' must be already allocated and properly aligned.
-One can create a DTable using FSE_createDTable().
-The function will return 1 if DTable is compatible with fastMode, 0 otherwise.
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+The function will return 1 if FSE_DTable is compatible with fastMode, 0 otherwise.
 If there is an error, the function will return an error code, which can be tested using FSE_isError().
 
-'DTable' can then be used to decompress 'compressed', with FSE_decompress_usingDTable().
-Only trigger fastMode if it was authorized by result of FSE_buildDTable(), otherwise decompression will fail.
+'FSE_DTable' can then be used to decompress 'cSrc', with FSE_decompress_usingDTable().
+Only trigger fastMode if it was authorized by the result of FSE_buildDTable(), otherwise decompression will fail.
 cSrcSize must be correct, otherwise decompression will fail.
 FSE_decompress_usingDTable() result will tell how many bytes were regenerated.
 If there is an error, the function will return an error code, which can be tested using FSE_isError().
 */
 
 
-/******************************************
-*  FSE streaming compression API
-******************************************/
-typedef struct
-{
-    size_t bitContainer;
-    int    bitPos;
-    char*  startPtr;
-    char*  ptr;
-} FSE_CStream_t;
-
-typedef struct
-{
-    ptrdiff_t   value;
-    const void* stateTable;
-    const void* symbolTT;
-    unsigned    stateLog;
-} FSE_CState_t;
-
-void   FSE_initCStream(FSE_CStream_t* bitC, void* dstBuffer);
-void   FSE_initCState(FSE_CState_t* CStatePtr, const void* CTable);
-
-void   FSE_encodeByte(FSE_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned char symbol);
-void   FSE_addBits(FSE_CStream_t* bitC, size_t value, unsigned nbBits);
-void   FSE_flushBits(FSE_CStream_t* bitC);
-
-void   FSE_flushCState(FSE_CStream_t* bitC, const FSE_CState_t* CStatePtr);
-size_t FSE_closeCStream(FSE_CStream_t* bitC);
-
-/*
-These functions are inner components of FSE_compress_usingCTable().
-They allow creation of custom streams, mixing multiple tables and bit sources.
-
-A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
-So the first symbol you will encode is the last you will decode, like a lifo stack.
-
-You will need a few variables to track your CStream. They are :
-
-void* CTable;           // Provided by FSE_buildCTable()
-FSE_CStream_t bitC;     // bitStream tracking structure
-FSE_CState_t state;     // State tracking structure
-
-
-The first thing to do is to init the bitStream, and the state.
-    FSE_initCStream(&bitC, dstBuffer);
-    FSE_initState(&state, CTable);
-
-You can then encode your input data, byte after byte.
-FSE_encodeByte() outputs a maximum of 'tableLog' bits at a time.
-Remember decoding will be done in reverse direction.
-    FSE_encodeByte(&bitStream, &state, symbol);
-
-At any time, you can add any bit sequence.
-Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
-    FSE_addBits(&bitStream, bitField, nbBits);
-
-The above methods don't commit data to memory, they just store it into local register, for speed.
-Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
-Writing data to memory is a manual operation, performed by the flushBits function.
-    FSE_flushBits(&bitStream);
-
-Your last FSE encoding operation shall be to flush your last state value(s).
-    FSE_flushState(&bitStream, &state);
-
-You must then close the bitStream if you opened it with FSE_initCStream().
-It's possible to embed some user-info into the header, as an optionalId [0-31].
-The function returns the size in bytes of CStream.
-If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
-    size_t size = FSE_closeCStream(&bitStream, optionalId);
-*/
-
-
-/******************************************
-*  FSE streaming decompression API
-******************************************/
-//typedef unsigned int bitD_t;
-typedef size_t bitD_t;
-
-typedef struct
-{
-    bitD_t   bitContainer;
-    unsigned bitsConsumed;
-    const char* ptr;
-    const char* start;
-} FSE_DStream_t;
-
-typedef struct
-{
-    bitD_t      state;
-    const void* table;
-} FSE_DState_t;
-
-
-size_t FSE_initDStream(FSE_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
-void   FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD, const void* DTable);
-
-unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD);
-bitD_t        FSE_readBits(FSE_DStream_t* bitD, unsigned nbBits);
-unsigned int  FSE_reloadDStream(FSE_DStream_t* bitD);
-
-unsigned FSE_endOfDStream(const FSE_DStream_t* bitD);
-unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
-
-/*
-Let's now decompose FSE_decompress_usingDTable() into its unitary elements.
-You will decode FSE-encoded symbols from the bitStream,
-and also any other bitFields you put in, **in reverse order**.
-
-You will need a few variables to track your bitStream. They are :
-
-FSE_DStream_t DStream;    // Stream context
-FSE_DState_t DState;      // State context. Multiple ones are possible
-const void* DTable;       // Decoding table, provided by FSE_buildDTable()
-U32 tableLog;             // Provided by FSE_readHeader()
-
-The first thing to do is to init the bitStream.
-    errorCode = FSE_initDStream(&DStream, &optionalId, srcBuffer, srcSize);
-
-You should then retrieve your initial state(s) (multiple ones are possible) :
-    errorCode = FSE_initDState(&DState, &DStream, DTable, tableLog);
-
-You can then decode your data, symbol after symbol.
-For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
-Keep in mind that symbols are decoded in reverse order, like a lifo stack (last in, first out).
-    unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
-
-You can retrieve any bitfield you eventually stored into the bitStream (in reverse order)
-Note : maximum allowed nbBits is 25
-    unsigned int bitField = FSE_readBits(&DStream, nbBits);
-
-All above operations only read from local register (which size is controlled by bitD_t==32 bits).
-Reading data from memory is manually performed by the reload method.
-    endSignal = FSE_reloadDStream(&DStream);
-
-FSE_reloadDStream() result tells if there is still some more data to read from DStream.
-0 : there is still some data left into the DStream.
-1 Dstream reached end of buffer, but is not yet fully extracted. It will not load data from memory any more.
-2 Dstream reached its exact end, corresponding in general to decompression completed.
-3 Dstream went too far. Decompression result is corrupted.
-
-When reaching end of buffer(1), progress slowly if you decode multiple symbols per loop,
-to properly detect the exact end of stream.
-After each decoded symbol, check if DStream is fully consumed using this simple test :
-    FSE_reloadDStream(&DStream) >= 2
-
-When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
-Checking if DStream has reached its end is performed by :
-    FSE_endOfDStream(&DStream);
-Check also the states. There might be some entropy left there, still able to decode some high probability symbol.
-    FSE_endOfDState(&DState);
-*/
-
-
 #if defined (__cplusplus)
 }
 #endif
diff --git a/lib/fse_static.h b/lib/fse_static.h
index 7d400a5..f085d1f 100644
--- a/lib/fse_static.h
+++ b/lib/fse_static.h
@@ -40,24 +40,19 @@
 
 
 /******************************************
-*  Tool functions
+*  FSE API compatible with DLL
 ******************************************/
-#define FSE_MAX_HEADERSIZE 512
-#define FSE_COMPRESSBOUND(size) (size + (size>>7) + FSE_MAX_HEADERSIZE)   /* Macro can be useful for static allocation */
+#include "fse.h"
 
 
 /******************************************
 *  Static allocation
 ******************************************/
-/* You can statically allocate a CTable as a table of U32 using below macro */
+#define FSE_MAX_HEADERSIZE 512
+#define FSE_COMPRESSBOUND(size) (size + (size>>7) + FSE_MAX_HEADERSIZE)   /* Macro can be useful for static allocation */
+/* You can statically allocate a CTable as a table of unsigned using below macro */
 #define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
-#define FSE_DTABLE_SIZE_U32(maxTableLog)                   ((1<<maxTableLog)+1)
-
-
-/******************************************
-*  FSE supported API for DLL
-******************************************/
-#include "fse.h"
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
 
 
 /******************************************
@@ -65,7 +60,7 @@
 ******************************************/
 #define FSE_LIST_ERRORS(ITEM) \
         ITEM(FSE_OK_NoError) ITEM(FSE_ERROR_GENERIC) \
-        ITEM(FSE_ERROR_tableLog_tooLarge) ITEM(FSE_ERROR_maxSymbolValue_tooLarge) \
+        ITEM(FSE_ERROR_tableLog_tooLarge) ITEM(FSE_ERROR_maxSymbolValue_tooLarge) ITEM(FSE_ERROR_maxSymbolValue_tooSmall) \
         ITEM(FSE_ERROR_dstSize_tooSmall) ITEM(FSE_ERROR_srcSize_wrong)\
         ITEM(FSE_ERROR_corruptionDetected) \
         ITEM(FSE_ERROR_maxCode)
@@ -77,26 +72,182 @@
 /******************************************
 *  FSE advanced API
 ******************************************/
-size_t FSE_countFast(unsigned* count, const unsigned char* src, size_t srcSize, unsigned* maxSymbolValuePtr);
-/* same as FSE_count(), but won't check if input really respect that all values within src are <= *maxSymbolValuePtr */
+size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const unsigned char* src, size_t srcSize);
+/* same as FSE_count(), but blindly trust that all values within src are <= maxSymbolValuePtr[0] */
 
-size_t FSE_buildCTable_raw (void* CTable, unsigned nbBits);
-/* create a fake CTable, designed to not compress an input where each element uses nbBits */
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
+/* build a fake FSE_CTable, designed to not compress an input, where each symbol uses nbBits */
 
-size_t FSE_buildCTable_rle (void* CTable, unsigned char symbolValue);
-/* create a fake CTable, designed to compress a single identical value */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
+/* build a fake FSE_CTable, designed to compress always the same symbolValue */
 
-size_t FSE_buildDTable_raw (void* DTable, unsigned nbBits);
-/* create a fake DTable, designed to read an uncompressed bitstream where each element uses nbBits */
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/* build a fake FSE_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
 
-size_t FSE_buildDTable_rle (void* DTable, unsigned char symbolValue);
-/* create a fake DTable, designed to always generate the same symbolValue */
+size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/* build a fake FSE_DTable, designed to always generate the same symbolValue */
 
 
 /******************************************
-*  FSE streaming API
+*  FSE symbol compression API
 ******************************************/
-bitD_t FSE_readBitsFast(FSE_DStream_t* bitD, unsigned nbBits);
+/*
+   This API consists of small unitary functions, which highly benefit from being inlined.
+   You will want to enable link-time-optimization to ensure these functions are properly inlined in your binary.
+   Visual seems to do it automatically.
+   For gcc or clang, you'll need to add -flto flag at compilation and linking stages.
+*/
+
+typedef struct
+{
+    size_t bitContainer;
+    int    bitPos;
+    char*  startPtr;
+    char*  ptr;
+} FSE_CStream_t;
+
+typedef struct
+{
+    ptrdiff_t   value;
+    const void* stateTable;
+    const void* symbolTT;
+    unsigned    stateLog;
+} FSE_CState_t;
+
+void   FSE_initCStream(FSE_CStream_t* bitC, void* dstBuffer);
+void   FSE_initCState(FSE_CState_t* CStatePtr, const FSE_CTable* ct);
+
+void   FSE_encodeSymbol(FSE_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned char symbol);
+void   FSE_addBits(FSE_CStream_t* bitC, size_t value, unsigned nbBits);
+void   FSE_flushBits(FSE_CStream_t* bitC);
+
+void   FSE_flushCState(FSE_CStream_t* bitC, const FSE_CState_t* CStatePtr);
+size_t FSE_closeCStream(FSE_CStream_t* bitC);
+
+/*
+These functions are inner components of FSE_compress_usingCTable().
+They allow the creation of custom streams, mixing multiple tables and bit sources.
+
+A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
+So the first symbol you will encode is the last you will decode, like a LIFO stack.
+
+You will need a few variables to track your CStream. They are :
+
+FSE_CTable ct;        // Provided by FSE_buildCTable()
+FSE_CStream_t bitC;   // bitStream tracking structure
+FSE_CState_t state;   // State tracking structure (can have several)
+
+
+The first thing to do is to init bitStream and state.
+    FSE_initCStream(&bitC, dstBuffer);
+    FSE_initCState(&state, ct);
+
+You can then encode your input data, byte after byte.
+FSE_encodeByte() outputs a maximum of 'tableLog' bits at a time.
+Remember decoding will be done in reverse direction.
+    FSE_encodeByte(&bitStream, &state, symbol);
+
+At any time, you can also add any bit sequence.
+Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
+    FSE_addBits(&bitStream, bitField, nbBits);
+
+The above methods don't commit data to memory, they just store it into local register, for speed.
+Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+Writing data to memory is a manual operation, performed by the flushBits function.
+    FSE_flushBits(&bitStream);
+
+Your last FSE encoding operation shall be to flush your last state value(s).
+    FSE_flushState(&bitStream, &state);
+
+Finally, you must then close the bitStream.
+The function returns the size in bytes of CStream.
+If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
+    size_t size = FSE_closeCStream(&bitStream);
+*/
+
+
+/******************************************
+*  FSE symbol decompression API
+******************************************/
+typedef struct
+{
+    size_t   bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+} FSE_DStream_t;
+
+typedef struct
+{
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+size_t FSE_initDStream(FSE_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+void   FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD, const FSE_DTable* dt);
+
+unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD);
+size_t        FSE_readBits(FSE_DStream_t* bitD, unsigned nbBits);
+unsigned int  FSE_reloadDStream(FSE_DStream_t* bitD);
+
+unsigned FSE_endOfDStream(const FSE_DStream_t* bitD);
+unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+/*
+Let's now decompose FSE_decompress_usingDTable() into its unitary components.
+You will decode FSE-encoded symbols from the bitStream,
+and also any other bitFields you put in, **in reverse order**.
+
+You will need a few variables to track your bitStream. They are :
+
+FSE_DStream_t DStream;  // Stream context
+FSE_DState_t DState;    // State context. Multiple ones are possible
+FSE_DTable dt;          // Decoding table, provided by FSE_buildDTable()
+U32 tableLog;           // Provided by FSE_readHeader()
+
+The first thing to do is to init the bitStream.
+    errorCode = FSE_initDStream(&DStream, &optionalId, srcBuffer, srcSize);
+
+You should then retrieve your initial state(s) :
+    errorCode = FSE_initDState(&DState, &DStream, dt, tableLog);
+
+You can then decode your data, symbol after symbol.
+For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
+Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out).
+    unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
+
+You can retrieve any bitfield you eventually stored into the bitStream (in reverse order)
+Note : maximum allowed nbBits is 25
+    unsigned int bitField = FSE_readBits(&DStream, nbBits);
+
+All above operations only read from local register (which size is controlled by bitD_t==32 bits).
+Refueling the register from memory is manually performed by the reload method.
+    endSignal = FSE_reloadDStream(&DStream);
+
+FSE_reloadDStream() result tells if there is still some more data to read from DStream.
+0 : there is still some data left into the DStream.
+1 : Dstream reached end of buffer, but is not yet fully extracted. It will not load data from memory any more.
+2 : Dstream reached its exact end, corresponding in general to decompression completed.
+3 : Dstream went too far. Decompression result is corrupted.
+
+When reaching end of buffer(1), progress slowly, notably if you decode multiple symbols per loop,
+to properly detect the exact end of stream.
+After each decoded symbol, check if DStream is fully consumed using this simple test :
+    FSE_reloadDStream(&DStream) >= 2
+
+When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
+Checking if DStream has reached its end is performed by :
+    FSE_endOfDStream(&DStream);
+Check also the states. There might be some entropy left there, able to decode some high probability (>50%) symbol.
+    FSE_endOfDState(&DState);
+*/
+
+
+/******************************************
+*  FSE unsafe symbol API
+******************************************/
+size_t FSE_readBitsFast(FSE_DStream_t* bitD, unsigned nbBits);
 /* faster, but works only if nbBits >= 1 (otherwise, result will be corrupted) */
 
 unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD);
diff --git a/lib/zstd.c b/lib/zstd.c
index cd58220..a4e77fd 100644
--- a/lib/zstd.c
+++ b/lib/zstd.c
@@ -535,7 +535,7 @@
 /* return : size of CStream in bits */
 static size_t ZSTD_compressLiterals_usingCTable(void* dst, size_t dstSize,
                                           const void* src, size_t srcSize,
-                                          const void* CTable)
+                                          const FSE_CTable* CTable)
 {
     const BYTE* const istart = (const BYTE*)src;
     const BYTE* ip = istart;
@@ -553,32 +553,32 @@
     // join to mod 2
     if (srcSize & 1)
     {
-        FSE_encodeByte(&bitC, &CState1, *ip++);
+        FSE_encodeSymbol(&bitC, &CState1, *ip++);
         FSE_flushBits(&bitC);
     }
 
     // join to mod 4
     if ((sizeof(size_t)*8 > LitFSELog*4+7 ) && (srcSize & 2))   // test bit 2
     {
-        FSE_encodeByte(&bitC, &CState2, *ip++);
-        FSE_encodeByte(&bitC, &CState1, *ip++);
+        FSE_encodeSymbol(&bitC, &CState2, *ip++);
+        FSE_encodeSymbol(&bitC, &CState1, *ip++);
         FSE_flushBits(&bitC);
     }
 
     // 2 or 4 encoding per loop
     while (ip<iend)
     {
-        FSE_encodeByte(&bitC, &CState2, *ip++);
+        FSE_encodeSymbol(&bitC, &CState2, *ip++);
 
         if (sizeof(size_t)*8 < LitFSELog*2+7 )   // this test must be static
             FSE_flushBits(&bitC);
 
-        FSE_encodeByte(&bitC, &CState1, *ip++);
+        FSE_encodeSymbol(&bitC, &CState1, *ip++);
 
         if (sizeof(size_t)*8 > LitFSELog*4+7 )   // this test must be static
         {
-            FSE_encodeByte(&bitC, &CState2, *ip++);
-            FSE_encodeByte(&bitC, &CState1, *ip++);
+            FSE_encodeSymbol(&bitC, &CState2, *ip++);
+            FSE_encodeSymbol(&bitC, &CState1, *ip++);
         }
 
         FSE_flushBits(&bitC);
@@ -618,7 +618,7 @@
     if (dstSize < FSE_compressBound(srcSize)) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
 
     /* Scan input and build symbol stats */
-    errorCode = FSE_count (count, ip, srcSize, &maxSymbolValue);
+    errorCode = FSE_count (count, &maxSymbolValue, ip, srcSize);
     if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
     if (errorCode == srcSize) return 1;
     if (errorCode < (srcSize >> 6)) return 0;   /* cheap heuristic : probably not compressible enough */
@@ -628,14 +628,14 @@
     if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
 
     /* Write table description header */
-    errorCode = FSE_writeHeader (op, FSE_MAX_HEADERSIZE, norm, maxSymbolValue, tableLog);
+    errorCode = FSE_writeNCount (op, FSE_MAX_HEADERSIZE, norm, maxSymbolValue, tableLog);
     if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
     op += errorCode;
 
     /* Compress */
-    errorCode = FSE_buildCTable (&CTable, norm, maxSymbolValue, tableLog);
+    errorCode = FSE_buildCTable (CTable, norm, maxSymbolValue, tableLog);
     if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    errorCode = ZSTD_compressLiterals_usingCTable(op, oend - op, ip, srcSize, &CTable);
+    errorCode = ZSTD_compressLiterals_usingCTable(op, oend - op, ip, srcSize, CTable);
     if (ZSTD_isError(errorCode)) return errorCode;
     op += errorCode;
 
@@ -738,7 +738,7 @@
 
     /* Encoding table of Literal Lengths */
     max = MaxLL;
-    mostFrequent = FSE_countFast(count, seqStorePtr->litLengthStart, nbSeq, &max);
+    mostFrequent = FSE_countFast(count, &max, seqStorePtr->litLengthStart, nbSeq);
     if (mostFrequent == nbSeq)
     {
         *op++ = *(seqStorePtr->litLengthStart);
@@ -754,7 +754,7 @@
     {
         tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max);
         FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
-        op += FSE_writeHeader(op, maxDstSize, norm, max, tableLog);
+        op += FSE_writeNCount(op, maxDstSize, norm, max, tableLog);
         FSE_buildCTable(CTable_LitLength, norm, max, tableLog);
         LLtype = bt_compressed;
     }
@@ -771,7 +771,7 @@
             if (op_offset_start[i]==0) offsetBits_start[i]=0;
         }
         offsetBitsPtr += nbSeq;
-        mostFrequent = FSE_countFast(count, offsetBits_start, nbSeq, &max);
+        mostFrequent = FSE_countFast(count, &max, offsetBits_start, nbSeq);
     }
     if (mostFrequent == nbSeq)
     {
@@ -788,14 +788,14 @@
     {
         tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max);
         FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
-        op += FSE_writeHeader(op, maxDstSize, norm, max, tableLog);
+        op += FSE_writeNCount(op, maxDstSize, norm, max, tableLog);
         FSE_buildCTable(CTable_OffsetBits, norm, max, tableLog);
         Offtype = bt_compressed;
     }
 
     /* Encoding Table of MatchLengths */
     max = MaxML;
-    mostFrequent = FSE_countFast(count, seqStorePtr->matchLengthStart, nbSeq, &max);
+    mostFrequent = FSE_countFast(count, &max, seqStorePtr->matchLengthStart, nbSeq);
     if (mostFrequent == nbSeq)
     {
         *op++ = *seqStorePtr->matchLengthStart;
@@ -811,7 +811,7 @@
     {
         tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max);
         FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
-        op += FSE_writeHeader(op, maxDstSize, norm, max, tableLog);
+        op += FSE_writeNCount(op, maxDstSize, norm, max, tableLog);
         FSE_buildCTable(CTable_MatchLength, norm, max, tableLog);
         MLtype = bt_compressed;
     }
@@ -836,12 +836,12 @@
             BYTE offCode = *(--offsetBitsPtr);                              /* 32b*/  /* 64b*/
             U32 nbBits = (offCode-1) * (!!offCode);
             BYTE litLength = *(--op_litLength);                             /* (7)*/  /* (7)*/
-            FSE_encodeByte(&blockStream, &stateMatchLength, matchLength);   /* 17 */  /* 17 */
+            FSE_encodeSymbol(&blockStream, &stateMatchLength, matchLength); /* 17 */  /* 17 */
             if (ZSTD_32bits()) FSE_flushBits(&blockStream);                 /*  7 */
             FSE_addBits(&blockStream, offset, nbBits);                      /* 32 */  /* 42 */
             if (ZSTD_32bits()) FSE_flushBits(&blockStream);                 /*  7 */
-            FSE_encodeByte(&blockStream, &stateOffsetBits, offCode);        /* 16 */  /* 51 */
-            FSE_encodeByte(&blockStream, &stateLitLength, litLength);       /* 26 */  /* 61 */
+            FSE_encodeSymbol(&blockStream, &stateOffsetBits, offCode);      /* 16 */  /* 51 */
+            FSE_encodeSymbol(&blockStream, &stateLitLength, litLength);     /* 26 */  /* 61 */
             FSE_flushBits(&blockStream);                                    /*  7 */  /*  7 */
         }
 
@@ -1243,7 +1243,7 @@
 FORCE_INLINE size_t ZSTD_decompressLiterals_usingDTable_generic(
                        void* const dst, size_t maxDstSize,
                  const void* src, size_t srcSize,
-                 const void* DTable, U32 fast)
+                 const FSE_DTable* DTable, U32 fast)
 {
     BYTE* op = (BYTE*) dst;
     BYTE* const olimit = op;
@@ -1303,7 +1303,7 @@
 static size_t ZSTD_decompressLiterals_usingDTable(
                        void* const dst, size_t maxDstSize,
                  const void* src, size_t srcSize,
-                 const void* DTable, U32 fast)
+                 const FSE_DTable* DTable, U32 fast)
 {
     if (fast) return ZSTD_decompressLiterals_usingDTable_generic(dst, maxDstSize, src, srcSize, DTable, 1);
     return ZSTD_decompressLiterals_usingDTable_generic(dst, maxDstSize, src, srcSize, DTable, 0);
@@ -1315,7 +1315,7 @@
     /* assumed : blockType == blockCompressed */
     const BYTE* ip = (const BYTE*)src;
     short norm[256];
-    void* DTable = ctx;
+    FSE_DTable* DTable = (FSE_DTable*)ctx;
     U32 maxSymbolValue = 255;
     U32 tableLog;
     U32 fastMode;
@@ -1323,7 +1323,7 @@
 
     if (srcSize < 2) return (size_t)-ZSTD_ERROR_wrongLBlockSize;   /* too small input size */
 
-    errorCode = FSE_readHeader (norm, &maxSymbolValue, &tableLog, ip, srcSize);
+    errorCode = FSE_readNCount (norm, &maxSymbolValue, &tableLog, ip, srcSize);
     if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
     ip += errorCode;
     srcSize -= errorCode;
@@ -1380,7 +1380,7 @@
 
 
 size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr,
-                               void* DTableLL, void* DTableML, void* DTableOffb,
+                         FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb,
                          const void* src, size_t srcSize)
 {
     const BYTE* const istart = (const BYTE* const)src;
@@ -1427,7 +1427,7 @@
             FSE_buildDTable_raw(DTableLL, LLbits); break;
         default :
             max = MaxLL;
-            headerSize = FSE_readHeader(norm, &max, &LLlog, ip, iend-ip);
+            headerSize = FSE_readNCount(norm, &max, &LLlog, ip, iend-ip);
             if (FSE_isError(headerSize)) return (size_t)-ZSTD_ERROR_GENERIC;
             ip += headerSize;
             FSE_buildDTable(DTableLL, norm, max, LLlog);
@@ -1444,7 +1444,7 @@
             FSE_buildDTable_raw(DTableOffb, Offbits); break;
         default :
             max = MaxOff;
-            headerSize = FSE_readHeader(norm, &max, &Offlog, ip, iend-ip);
+            headerSize = FSE_readNCount(norm, &max, &Offlog, ip, iend-ip);
             if (FSE_isError(headerSize)) return (size_t)-ZSTD_ERROR_GENERIC;
             ip += headerSize;
             FSE_buildDTable(DTableOffb, norm, max, Offlog);
@@ -1461,7 +1461,7 @@
             FSE_buildDTable_raw(DTableML, MLbits); break;
         default :
             max = MaxML;
-            headerSize = FSE_readHeader(norm, &max, &MLlog, ip, iend-ip);
+            headerSize = FSE_readNCount(norm, &max, &MLlog, ip, iend-ip);
             if (FSE_isError(headerSize)) return (size_t)-ZSTD_ERROR_GENERIC;
             ip += headerSize;
             FSE_buildDTable(DTableML, norm, max, MLlog);
@@ -1489,9 +1489,9 @@
     const BYTE* litEnd;
     const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};   /* added */
     const size_t dec64table[] = {8, 8, 8, 7, 8, 9,10,11};   /* substracted */
-    void* DTableML = ctx;
-    void* DTableLL = ((U32*)ctx) + FSE_DTABLE_SIZE_U32(MLFSELog);
-    void* DTableOffb = ((U32*)DTableLL) + FSE_DTABLE_SIZE_U32(LLFSELog);
+    FSE_DTable* DTableML = (FSE_DTable*)ctx;
+    FSE_DTable* DTableLL = DTableML + FSE_DTABLE_SIZE_U32(MLFSELog);
+    FSE_DTable* DTableOffb = DTableLL + FSE_DTABLE_SIZE_U32(LLFSELog);
 
     /* blockType == blockCompressed, srcSize is trusted */
 
diff --git a/programs/fullbench.c b/programs/fullbench.c
index 7afd12c..11d9b62 100644
--- a/programs/fullbench.c
+++ b/programs/fullbench.c
@@ -230,7 +230,7 @@
 
 extern size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr);
 extern size_t ZSTD_decodeLiteralsBlock(void* ctx, void* dst, size_t maxDstSize, const BYTE** litPtr, const void* src, size_t srcSize);
-extern size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr, void* DTableLL, void* DTableML, void* DTableOffb, const void* src, size_t srcSize);
+extern size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr, FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb, const void* src, size_t srcSize);
 
 
 size_t local_ZSTD_compress(void* dst, size_t dstSize, void* buff2, const void* src, size_t srcSize)