speed up small blocks
diff --git a/lib/common/entropy_common.c b/lib/common/entropy_common.c
index 6b825af..0a13d9d 100644
--- a/lib/common/entropy_common.c
+++ b/lib/common/entropy_common.c
@@ -50,6 +50,7 @@
U32 bitStream;
int bitCount;
unsigned charnum = 0;
+ unsigned const maxSV1 = *maxSVPtr + 1;
int previous0 = 0;
if (hbSize < 4) {
@@ -76,27 +77,39 @@
threshold = 1<<nbBits;
nbBits++;
- while ((remaining>1) & (charnum<=*maxSVPtr)) {
+ for (;;) {
if (previous0) {
- unsigned n0 = charnum;
- while ((bitStream & 0xFFFF) == 0xFFFF) {
- n0 += 24;
- if (ip < iend-5) {
- ip += 2;
+ // TODO: Generalize to FSE_countTrailingZeros() or something
+ int repeats = __builtin_ctz(~bitStream) >> 1;
+ while (repeats >= 12) {
+ charnum += 3 * 12;
+ if (ip < iend-6) {
+ ip += 3;
bitStream = MEM_readLE32(ip) >> bitCount;
} else {
- bitStream >>= 16;
- bitCount += 16;
- } }
- while ((bitStream & 3) == 3) {
- n0 += 3;
- bitStream >>= 2;
- bitCount += 2;
+ bitStream >>= 24;
+ bitCount += 24;
+ }
+ repeats = __builtin_ctz(~bitStream) >> 1;
}
- n0 += bitStream & 3;
+ charnum += 3 * repeats;
+ bitStream >>= 2 * repeats;
+ bitCount += 2 * repeats;
+
+ assert(bitCount < 30 && (bitStream & 3) != 3);
+ charnum += bitStream & 3;
bitCount += 2;
- if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
- while (charnum < n0) normalizedCounter[charnum++] = 0;
+
+ /* This is an error, but break and return an error
+ * at the end, because returning out of a loop makes
+ * it harder for the compiler to optimize.
+ */
+ if (charnum >= maxSV1) break;
+
+ /* We don't need to set the normalized count to 0
+ * because we already memset the whole buffer to 0.
+ */
+
if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
assert((bitCount >> 3) <= 3); /* For first condition to work */
ip += bitCount>>3;
@@ -104,8 +117,10 @@
bitStream = MEM_readLE32(ip) >> bitCount;
} else {
bitStream >>= 2;
- } }
- { int const max = (2*threshold-1) - remaining;
+ }
+ }
+ {
+ int const max = (2*threshold-1) - remaining;
int count;
if ((bitStream & (threshold-1)) < (U32)max) {
@@ -118,15 +133,31 @@
}
count--; /* extra accuracy */
- remaining -= count < 0 ? -count : count; /* -1 means +1 */
+ /* When it matters (small blocks), this is a
+ * predictable branch, because we don't use -1.
+ */
+ if (count >= 0) {
+ remaining -= count;
+ } else {
+ assert(count == -1);
+ remaining += count;
+ }
normalizedCounter[charnum++] = (short)count;
previous0 = !count;
- while (remaining < threshold) {
- nbBits--;
- threshold >>= 1;
- }
- if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+ assert(threshold > 1);
+ if (remaining < threshold) {
+ /* This branch can be folded into the
+ * threshold update condition because we
+ * know that threshold > 1.
+ */
+ if (remaining <= 1) break;
+ nbBits = BIT_highbit32(remaining) + 1;
+ threshold = 1 << (nbBits - 1);
+ }
+ if (charnum >= maxSV1) break;
+
+ if (LIKELY((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))) {
ip += bitCount>>3;
bitCount &= 7;
} else {
@@ -134,8 +165,10 @@
ip = iend - 4;
}
bitStream = MEM_readLE32(ip) >> (bitCount & 31);
- } } /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
+ } }
if (remaining != 1) return ERROR(corruption_detected);
+ /* Only possible when there are too many zeros. */
+ if (charnum > maxSV1) return ERROR(maxSymbolValue_tooSmall);
if (bitCount > 32) return ERROR(corruption_detected);
*maxSVPtr = charnum-1;
@@ -143,7 +176,6 @@
return ip-istart;
}
-
/*! HUF_readStats() :
Read compact Huffman tree, saved by HUF_writeCTable().
`huffWeight` is destination buffer.
diff --git a/lib/compress/fse_compress.c b/lib/compress/fse_compress.c
index 5290a91..1187e3e 100644
--- a/lib/compress/fse_compress.c
+++ b/lib/compress/fse_compress.c
@@ -341,6 +341,8 @@
return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
}
+// TODO: Emit -1 based on # of symbols
+#define LOW_PROB 0
/* Secondary normalization method.
To be used when primary method fails. */
@@ -361,7 +363,7 @@
norm[s]=0;
continue;
}
- if (count[s] <= lowThreshold) {
+ if (LOW_PROB && count[s] <= lowThreshold) {
norm[s] = -1;
distributed++;
total -= count[s];
@@ -431,7 +433,6 @@
return 0;
}
-
size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
const unsigned* count, size_t total,
unsigned maxSymbolValue)
@@ -455,7 +456,7 @@
for (s=0; s<=maxSymbolValue; s++) {
if (count[s] == total) return 0; /* rle special case */
if (count[s] == 0) { normalizedCounter[s]=0; continue; }
- if (count[s] <= lowThreshold) {
+ if (LOW_PROB && count[s] <= lowThreshold) {
normalizedCounter[s] = -1;
stillToDistribute--;
} else {
@@ -476,20 +477,6 @@
else normalizedCounter[largest] += (short)stillToDistribute;
}
-#if 0
- { /* Print Table (debug) */
- U32 s;
- U32 nTotal = 0;
- for (s=0; s<=maxSymbolValue; s++)
- RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]);
- for (s=0; s<=maxSymbolValue; s++)
- nTotal += abs(normalizedCounter[s]);
- if (nTotal != (1U<<tableLog))
- RAWLOG(2, "Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
- getchar();
- }
-#endif
-
return tableLog;
}
diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c
index be5c7cf..7d321b2 100644
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@@ -1091,7 +1091,8 @@
ZSTD_buildFSETable( entropy->OFTable,
offcodeNCount, offcodeMaxValue,
OF_base, OF_bits,
- offcodeLog);
+ offcodeLog,
+ entropy->workspace, sizeof(entropy->workspace));
dictPtr += offcodeHeaderSize;
}
@@ -1104,7 +1105,8 @@
ZSTD_buildFSETable( entropy->MLTable,
matchlengthNCount, matchlengthMaxValue,
ML_base, ML_bits,
- matchlengthLog);
+ matchlengthLog,
+ entropy->workspace, sizeof(entropy->workspace));
dictPtr += matchlengthHeaderSize;
}
@@ -1117,7 +1119,8 @@
ZSTD_buildFSETable( entropy->LLTable,
litlengthNCount, litlengthMaxValue,
LL_base, LL_bits,
- litlengthLog);
+ litlengthLog,
+ entropy->workspace, sizeof(entropy->workspace));
dictPtr += litlengthHeaderSize;
}
diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c
index e93d6fe..95afcaa 100644
--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@@ -368,19 +368,18 @@
ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
const short* normalizedCounter, unsigned maxSymbolValue,
const U32* baseValue, const U32* nbAdditionalBits,
- unsigned tableLog)
+ unsigned tableLog, U32* wksp, size_t wkspSize)
{
ZSTD_seqSymbol* const tableDecode = dt+1;
U16 symbolNext[MaxSeq+1];
U32 const maxSV1 = maxSymbolValue + 1;
U32 const tableSize = 1 << tableLog;
- U32 highThreshold = tableSize-1;
/* Sanity Checks */
assert(maxSymbolValue <= MaxSeq);
assert(tableLog <= MaxFSELog);
-
+ U32 highThreshold = tableSize - 1;
/* Init, lay down lowprob symbols */
{ ZSTD_seqSymbol_header DTableH;
DTableH.tableLog = tableLog;
@@ -400,12 +399,68 @@
}
/* Spread symbols */
- { U32 const tableMask = tableSize-1;
+ assert(tableSize <= 512);
+ /* Specialized symbol spreading for the case when there are
+ * no low probability (-1 count) symbols. When compressing
+ * small blocks we avoid low probability symbols to hit this
+ * case, since header decoding speed matters more.
+ */
+ if (highThreshold == tableSize - 1) {
+ size_t const tableMask = tableSize-1;
+ size_t const step = FSE_TABLESTEP(tableSize);
+ /* First lay down the symbols in order.
+ * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
+ * misses since small blocks generally have small table logs, so nearly
+ * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
+ * our buffer to handle the over-write.
+ */
+ BYTE* spread = (BYTE*)wksp;
+ assert(wkspSize >= (1u << MaxFSELog) + sizeof(U64));
+ (void)wkspSize;
+ {
+ U64 const add = 0x0101010101010101ull;
+ size_t pos = 0;
+ U64 sv = 0;
+ U32 s;
+ for (s=0; s<maxSV1; ++s, sv += add) {
+ int i;
+ int const n = normalizedCounter[s];
+ MEM_write64(spread + pos, sv);
+ for (i = 8; i < n; i += 8) {
+ MEM_write64(spread + pos + i, sv);
+ }
+ pos += n;
+ }
+ }
+ /* Now we spread those positions across the table.
+ * The benefit of doing it in two stages is that we avoid the the
+ * variable size inner loop, which caused lots of branch misses.
+ * Now we can run through all the positions without any branch misses.
+ * We unroll the loop twice, since that is what emperically worked best.
+ */
+ {
+ size_t position = 0;
+ size_t s;
+ size_t const unroll = 2;
+ assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+ for (s = 0; s < (size_t)tableSize; s += unroll) {
+ size_t u;
+ for (u = 0; u < unroll; ++u) {
+ size_t const uPosition = (position + (u * step)) & tableMask;
+ tableDecode[uPosition].baseValue = spread[s + u];
+ }
+ position = (position + (unroll * step)) & tableMask;
+ }
+ assert(position == 0);
+ }
+ } else {
+ U32 const tableMask = tableSize-1;
U32 const step = FSE_TABLESTEP(tableSize);
U32 s, position = 0;
for (s=0; s<maxSV1; s++) {
int i;
- for (i=0; i<normalizedCounter[s]; i++) {
+ int const n = normalizedCounter[s];
+ for (i=0; i<n; i++) {
tableDecode[position].baseValue = s;
position = (position + step) & tableMask;
while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
@@ -414,7 +469,8 @@
}
/* Build Decoding table */
- { U32 u;
+ {
+ U32 u;
for (u=0; u<tableSize; u++) {
U32 const symbol = tableDecode[u].baseValue;
U32 const nextState = symbolNext[symbol]++;
@@ -423,7 +479,8 @@
assert(nbAdditionalBits[symbol] < 255);
tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
tableDecode[u].baseValue = baseValue[symbol];
- } }
+ }
+ }
}
@@ -435,7 +492,7 @@
const void* src, size_t srcSize,
const U32* baseValue, const U32* nbAdditionalBits,
const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
- int ddictIsCold, int nbSeq)
+ int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize)
{
switch(type)
{
@@ -467,7 +524,7 @@
size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
- ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
+ ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
*DTablePtr = DTableSpace;
return headerSize;
}
@@ -520,7 +577,8 @@
ip, iend-ip,
LL_base, LL_bits,
LL_defaultDTable, dctx->fseEntropy,
- dctx->ddictIsCold, nbSeq);
+ dctx->ddictIsCold, nbSeq,
+ dctx->workspace, sizeof(dctx->workspace));
RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
ip += llhSize;
}
@@ -530,7 +588,8 @@
ip, iend-ip,
OF_base, OF_bits,
OF_defaultDTable, dctx->fseEntropy,
- dctx->ddictIsCold, nbSeq);
+ dctx->ddictIsCold, nbSeq,
+ dctx->workspace, sizeof(dctx->workspace));
RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
ip += ofhSize;
}
@@ -540,7 +599,8 @@
ip, iend-ip,
ML_base, ML_bits,
ML_defaultDTable, dctx->fseEntropy,
- dctx->ddictIsCold, nbSeq);
+ dctx->ddictIsCold, nbSeq,
+ dctx->workspace, sizeof(dctx->workspace));
RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
ip += mlhSize;
}
diff --git a/lib/decompress/zstd_decompress_block.h b/lib/decompress/zstd_decompress_block.h
index bf39b73..201d6a9 100644
--- a/lib/decompress/zstd_decompress_block.h
+++ b/lib/decompress/zstd_decompress_block.h
@@ -53,7 +53,7 @@
void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
const short* normalizedCounter, unsigned maxSymbolValue,
const U32* baseValue, const U32* nbAdditionalBits,
- unsigned tableLog);
+ unsigned tableLog, U32* wksp, size_t wkspSize);
#endif /* ZSTD_DEC_BLOCK_H */
diff --git a/lib/decompress/zstd_decompress_internal.h b/lib/decompress/zstd_decompress_internal.h
index 9ad96c5..1a5c7ee 100644
--- a/lib/decompress/zstd_decompress_internal.h
+++ b/lib/decompress/zstd_decompress_internal.h
@@ -72,6 +72,7 @@
} ZSTD_seqSymbol;
#define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log)))
+ #define ZSTD_FSE_WKSP_SIZE_U32 130
typedef struct {
ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */
@@ -79,6 +80,7 @@
ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
U32 rep[ZSTD_REP_NUM];
+ U32 workspace[ZSTD_FSE_WKSP_SIZE_U32];
} ZSTD_entropyDTables_t;
typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,