minor compression gain
diff --git a/lib/fse.c b/lib/fse.c
index 1f382ed..606dcbc 100644
--- a/lib/fse.c
+++ b/lib/fse.c
@@ -205,7 +205,7 @@
                 break;
             case -1:
             case  1:
-                symbolTT[s].deltaNbBits = tableLog << 16;
+                symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
                 symbolTT[s].deltaFindState = total - 1;
                 total ++;
                 break;
@@ -216,10 +216,7 @@
                     symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
                     symbolTT[s].deltaFindState = total - normalizedCounter[s];
                     total +=  normalizedCounter[s];
-                }
-            }
-        }
-    }   /* Build Symbol Transformation Table */
+    }   }   }   }
 
     return 0;
 }
@@ -388,8 +385,7 @@
             out += 2;
             bitStream >>= 16;
             bitCount -= 16;
-        }
-    }
+    }   }
 
     /* flush remaining bitStream */
     if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
@@ -596,8 +592,7 @@
     while (ip<iend) Counting1[*ip++]++;
 
     if (checkMax) {   /* verify stats will fit into destination table */
-        for (s=255; s>maxSymbolValue; s--)
-        {
+        for (s=255; s>maxSymbolValue; s--) {
             Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
             if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall);
     }   }
@@ -854,7 +849,7 @@
 
     /* Build Symbol Transformation Table */
     for (s=0; s<=maxSymbolValue; s++) {
-        symbolTT[s].deltaNbBits = nbBits << 16;
+        symbolTT[s].deltaNbBits = (nbBits << 16) - (1 << nbBits);
         symbolTT[s].deltaFindState = s-1;
     }
 
diff --git a/lib/fse_static.h b/lib/fse_static.h
index eb03163..ca303db 100644
--- a/lib/fse_static.h
+++ b/lib/fse_static.h
@@ -239,6 +239,19 @@
     statePtr->stateLog = tableLog;
 }
 
+MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)   /* initialise the state from `ct`, then preload it for `symbol`; takes no bitstream argument, so nothing is written out here */
+{
+    FSE_initCState(statePtr, ct);   /* regular initialisation first; statePtr->value is overwritten below */
+    {
+        const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];   /* by-value copy of the transform entry for `symbol` */
+        const U16* stateTable = (const U16*)(statePtr->stateTable);
+        U32 nbBitsOut  = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);   /* adds half of (1<<16) before dropping the low 16 bits */
+        statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;   /* temporary value, only used to index stateTable on the next line */
+        statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];   /* final initial state, read from the state table */
+
+    }
+}
+
 MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol)
 {
     const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
@@ -277,6 +290,17 @@
     DStatePtr->table = dt + 1;
 }
 
+MEM_STATIC size_t FSE_getStateValue(FSE_DState_t* DStatePtr)   /* read-only accessor for the decoder's current state */
+{
+    return DStatePtr->state;   /* returned as-is; neither the state nor any bitstream is touched */
+}
+
+MEM_STATIC BYTE FSE_peakSymbol(FSE_DState_t* DStatePtr)   /* returns the symbol stored at the current state without modifying the state; NOTE(review): "peak" is presumably meant as "peek" - renaming would require updating every caller */
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];   /* by-value copy of the decode-table entry indexed by the current state */
+    return DInfo.symbol;   /* only the symbol field is used; no bits are read since no bitstream is passed in */
+}
+
 MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
 {
     const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
diff --git a/lib/zstd_compress.c b/lib/zstd_compress.c
index bc78d2d..eb3031d 100644
--- a/lib/zstd_compress.c
+++ b/lib/zstd_compress.c
@@ -547,9 +547,13 @@
     if ((oend-op) < MIN_SEQUENCES_SIZE)
         return ERROR(dstSize_tooSmall);
     MEM_writeLE16(op, (U16)nbSeq); op+=2;
-    seqHead = op;
+
+    if (nbSeq==0) goto _check_compressibility;
 
     /* dumps : contains rests of large lengths */
+    if ((oend-op) < 3 /* dumps */ + 1 /*seqHead*/)
+        return ERROR(dstSize_tooSmall);
+    seqHead = op;
     {
         size_t dumpsLength = seqStorePtr->dumps - seqStorePtr->dumpsStart;
         if (dumpsLength < 512) {
@@ -572,9 +576,9 @@
 
     /* CTable for Literal Lengths */
     max = MaxLL;
-    mostFrequent = FSE_countFast(count, &max, seqStorePtr->litLengthStart, nbSeq);
+    mostFrequent = FSE_countFast(count, &max, llTable, nbSeq);
     if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
-        *op++ = *(seqStorePtr->litLengthStart);
+        *op++ = llTable[0];
         FSE_buildCTable_rle(CTable_LitLength, (BYTE)max);
         LLtype = FSE_ENCODING_RLE;
     } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
@@ -584,8 +588,10 @@
         LLtype = FSE_ENCODING_RAW;
     } else {
         size_t NCountSize;
+        size_t nbSeq_1 = nbSeq;
         U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max);
-        FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
+        if (count[llTable[nbSeq-1]]>1) { count[llTable[nbSeq-1]]--; nbSeq_1--; }
+        FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
         NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
         if (FSE_isError(NCountSize)) return ERROR(GENERIC);
         op += NCountSize;
@@ -603,7 +609,7 @@
     max = MaxOff;
     mostFrequent = FSE_countFast(count, &max, offCodeTable, nbSeq);
     if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
-        *op++ = *offCodeTable;
+        *op++ = offCodeTable[0];
         FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
         Offtype = FSE_ENCODING_RLE;
     } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
@@ -613,8 +619,10 @@
         Offtype = FSE_ENCODING_RAW;
     } else {
         size_t NCountSize;
+        size_t nbSeq_1 = nbSeq;
         U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max);
-        FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
+        if (count[offCodeTable[nbSeq-1]]>1) { count[offCodeTable[nbSeq-1]]--; nbSeq_1--; }
+        FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
         NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
         if (FSE_isError(NCountSize)) return ERROR(GENERIC);
         op += NCountSize;
@@ -624,9 +632,9 @@
 
     /* CTable for MatchLengths */
     max = MaxML;
-    mostFrequent = FSE_countFast(count, &max, seqStorePtr->matchLengthStart, nbSeq);
+    mostFrequent = FSE_countFast(count, &max, mlTable, nbSeq);
     if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
-        *op++ = *seqStorePtr->matchLengthStart;
+        *op++ = *mlTable;
         FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max);
         MLtype = FSE_ENCODING_RLE;
     } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
@@ -659,22 +667,26 @@
 
         errorCode = BIT_initCStream(&blockStream, op, oend-op);
         if (ERR_isError(errorCode)) return ERROR(dstSize_tooSmall);   /* not enough space remaining */
-        FSE_initCState(&stateMatchLength, CTable_MatchLength);
-        FSE_initCState(&stateOffsetBits, CTable_OffsetBits);
-        FSE_initCState(&stateLitLength, CTable_LitLength);
 
-        for (i=(int)nbSeq-1; i>=0; i--) {
+        /* first symbols */
+        FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlTable[nbSeq-1]);
+        FSE_initCState2(&stateOffsetBits,  CTable_OffsetBits,  offCodeTable[nbSeq-1]);
+        FSE_initCState2(&stateLitLength,   CTable_LitLength,   llTable[nbSeq-1]);
+        BIT_addBits(&blockStream, offsetTable[nbSeq-1], offCodeTable[nbSeq-1] ? (offCodeTable[nbSeq-1]-1) : 0);
+        BIT_flushBits(&blockStream);
+
+        for (i=(int)nbSeq-2; i>=0; i--) {
             BYTE mlCode = mlTable[i];
             U32  offset = offsetTable[i];
             BYTE offCode = offCodeTable[i];                                 /* 32b*/  /* 64b*/
-            U32 nbBits = (offCode-1) * (!!offCode);
+            U32 nbBits = (offCode-1) + (!offCode);
             BYTE litLength = llTable[i];                                    /* (7)*/  /* (7)*/
             FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode);      /* 17 */  /* 17 */
             if (MEM_32bits()) BIT_flushBits(&blockStream);                  /*  7 */
-            BIT_addBits(&blockStream, offset, nbBits);                      /* 31 */  /* 42 */   /* 24 bits max in 32-bits mode */
-            if (MEM_32bits()) BIT_flushBits(&blockStream);                  /*  7 */
-            FSE_encodeSymbol(&blockStream, &stateOffsetBits, offCode);      /* 16 */  /* 51 */
             FSE_encodeSymbol(&blockStream, &stateLitLength, litLength);     /* 26 */  /* 61 */
+            FSE_encodeSymbol(&blockStream, &stateOffsetBits, offCode);      /* 16 */  /* 51 */
+            if (MEM_32bits()) BIT_flushBits(&blockStream);                  /*  7 */
+            BIT_addBits(&blockStream, offset, nbBits);                      /* 31 */  /* 42 */   /* 24 bits max in 32-bits mode */
             BIT_flushBits(&blockStream);                                    /*  7 */  /*  7 */
         }
 
@@ -688,14 +700,15 @@
     }
 
     /* check compressibility */
+_check_compressibility:
     if ((size_t)(op-ostart) >= maxCSize) return 0;
 
     return op - ostart;
 }
 
 
-/** ZSTD_storeSeq
-    Store a sequence (literal length, literals, offset code and match length) into seqStore_t
+/*! ZSTD_storeSeq
+    Store a sequence (literal length, literals, offset code and match length code) into seqStore_t
     @offsetCode : distance to match, or 0 == repCode
     @matchCode : matchLength - MINMATCH
 */
diff --git a/lib/zstd_decompress.c b/lib/zstd_decompress.c
index b50e474..cdc7d30 100644
--- a/lib/zstd_decompress.c
+++ b/lib/zstd_decompress.c
@@ -426,7 +426,7 @@
                 break;
             }
 
-            if (litSize+WILDCOPY_OVERLENGTH > srcSize)   /* risk reading beyond src buffer with wildcopy */
+            if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize)   /* risk reading beyond src buffer with wildcopy */
             {
                 if (litSize > srcSize-lhSize) return ERROR(corruption_detected);
                 memcpy(dctx->litBuffer, istart+lhSize, litSize);
@@ -483,10 +483,12 @@
     size_t dumpsLength;
 
     /* check */
-    if (srcSize < 5) return ERROR(srcSize_wrong);
+    if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
 
     /* SeqHead */
     *nbSeq = MEM_readLE16(ip); ip+=2;
+    if (*nbSeq==0) return 2;
+
     LLtype  = *ip >> 6;
     Offtype = (*ip >> 4) & 3;
     MLtype  = (*ip >> 2) & 3;
@@ -589,8 +591,8 @@
 
 typedef struct {
     size_t litLength;
-    size_t offset;
     size_t matchLength;
+    size_t offset;
 } seq_t;
 
 typedef struct {
@@ -603,7 +605,6 @@
     const BYTE* dumpsEnd;
 } seqState_t;
 
-
 static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
 {
     size_t litLength;
@@ -614,7 +615,7 @@
     const BYTE* const de = seqState->dumpsEnd;
 
     /* Literal length */
-    litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));
+    litLength = FSE_peakSymbol(&(seqState->stateLL));
     prevOffset = litLength ? seq->offset : seqState->prevOffset;
     if (litLength == MaxLL) {
         U32 add = *dumps++;
@@ -632,17 +633,20 @@
                 1 /*fake*/, 1, 2, 4, 8, 16, 32, 64, 128, 256,
                 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144,
                 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, /*fake*/ 1, 1, 1, 1, 1 };
-        U32 offsetCode, nbBits;
-        offsetCode = FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));   /* <= maxOff, by table construction */
-        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
-        nbBits = offsetCode - 1;
+        U32 offsetCode = FSE_peakSymbol(&(seqState->stateOffb));   /* <= maxOff, by table construction */
+        U32 nbBits = offsetCode - 1;
         if (offsetCode==0) nbBits = 0;   /* cmove */
         offset = offsetPrefix[offsetCode] + BIT_readBits(&(seqState->DStream), nbBits);
         if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
-        if (offsetCode==0) offset = prevOffset;   /* cmove */
+        if (offsetCode==0) offset = prevOffset;   /* repcode, cmove */
         if (offsetCode | !litLength) seqState->prevOffset = seq->offset;   /* cmove */
+        FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));    /* update */
     }
 
+    /* Literal length update */
+    FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));   /* update */
+    if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+
     /* MatchLength */
     matchLength = FSE_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
     if (matchLength == MaxML) {
@@ -778,7 +782,7 @@
     ip += errorCode;
 
     /* Regen sequences */
-    {
+    if (nbSeq) {
         seq_t sequence;
         seqState_t seqState;
 
@@ -803,16 +807,18 @@
         }
 
         /* check if reached exact end */
-        if ( !BIT_endOfDStream(&(seqState.DStream)) ) return ERROR(corruption_detected);   /* DStream should be entirely and exactly consumed; otherwise data is corrupted */
+        if (nbSeq)
+            return ERROR(corruption_detected);   /* DStream should be entirely and exactly consumed; otherwise data is corrupted */
+    }
 
-        /* last literal segment */
-        {
-            size_t lastLLSize = litEnd - litPtr;
-            if (litPtr > litEnd) return ERROR(corruption_detected);
-            if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
-            if (op != litPtr) memcpy(op, litPtr, lastLLSize);
-            op += lastLLSize;
-    }   }
+    /* last literal segment */
+    {
+        size_t lastLLSize = litEnd - litPtr;
+        if (litPtr > litEnd) return ERROR(corruption_detected);   /* too many literals already used */
+        if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
+        memcpy(op, litPtr, lastLLSize);
+        op += lastLLSize;
+    }
 
     return op-ostart;
 }
diff --git a/lib/zstd_internal.h b/lib/zstd_internal.h
index 7eda813..0b993b1 100644
--- a/lib/zstd_internal.h
+++ b/lib/zstd_internal.h
@@ -102,7 +102,7 @@
 
 #define HufLog 12
 
-#define MIN_SEQUENCES_SIZE (2 /*seqNb*/ + 2 /*dumps*/ + 3 /*seqTables*/ + 1 /*bitStream*/)
+#define MIN_SEQUENCES_SIZE 2 /*seqNb*/
 #define MIN_CBLOCK_SIZE (1 /*litCSize*/ + MIN_SEQUENCES_SIZE)
 
 #define WILDCOPY_OVERLENGTH 8