Add array to keep track of frequency within active segment, fix malloc bug, update benchmarking result
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
index 478d879..07d65b0 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@@ -14,42 +14,46 @@
###Benchmarking Result:
+d=8
+f=23
+freq[i] = 0 when dmer added to best segment
+
github:
| Algorithm | Speed(sec) | Compression Ratio |
-| ------------------|:-------------:| ------------------:|
-| nodict | 0.000004 | 2.999642 |
-| random | 0.148247 | 8.786957 |
-| cover | 56.331553 | 10.641263 |
-| legacy | 0.917595 | 8.989482 |
-| fastCover(opt) | 13.169979 | 10.215174 |
-| fastCover(k=200) | 2.692406 | 8.657219 |
+| ----------------- | ------------- | ------------------ |
+| nodict | 0.000007 | 2.999642 |
+| random | 0.150258 | 8.786957 |
+| cover | 60.388853 | 10.641263 |
+| legacy | 0.965050 | 8.989482 |
+| fastCover(opt) | 84.968131 | 10.614747 |
+| fastCover(k=200) | 6.465490 | 9.484150 |
hg-commands
| Algorithm | Speed(sec) | Compression Ratio |
-| ----------------- |:-------------:| ------------------:|
-| nodict | 0.000007 | 2.425291 |
-| random | 0.093990 | 3.489515 |
-| cover | 58.602385 | 4.131136 |
-| legacy | 0.865683 | 3.911896 |
-| fastCover(opt) | 9.404134 | 3.977229 |
-| fastCover(k=200) | 1.037434 | 3.810326 |
+| ----------------- | ------------- | ------------------ |
+| nodict | 0.000005 | 2.425291 |
+| random | 0.084348 | 3.489515 |
+| cover | 60.144894 | 4.131136 |
+| legacy | 0.831981 | 3.911896 |
+| fastCover(opt) | 59.030437 | 4.157595 |
+| fastCover(k=200) | 3.702932 | 4.134222 |
hg-changelog
| Algorithm | Speed(sec) | Compression Ratio |
-| ----------------- |:-------------:| ------------------:|
-| nodict | 0.000022 | 1.377613 |
-| random | 0.551539 | 2.096785 |
-| cover | 221.370056 | 2.188654 |
-| legacy | 2.405923 | 2.058273 |
-| fastCover(opt) | 49.526246 | 2.124185 |
-| fastCover(k=200) | 9.746872 | 2.114674 |
+| ----------------- | ------------- | ------------------ |
+| nodict | 0.000004 | 1.377613 |
+| random | 0.555964 | 2.096785 |
+| cover | 214.423753 | 2.188654 |
+| legacy | 2.180249 | 2.058273 |
+| fastCover(opt) | 102.261452 | 2.180347 |
+| fastCover(k=200) | 11.81039 | 2.170673 |
hg-manifest
| Algorithm | Speed(sec) | Compression Ratio |
-| ----------------- |:-------------:| ------------------:|
-| nodict | 0.000019 | 1.866385 |
-| random | 1.083536 | 2.309485 |
-| cover | 928.894887 | 2.582597 |
-| legacy | 9.110371 | 2.506775 |
-| fastCover(opt) | 116.508270 | 2.525689 |
-| fastCover(k=200) | 12.176555 | 2.472221 |
+| ----------------- | ------------- | ------------------ |
+| nodict | 0.000006 | 1.866385 |
+| random | 1.063974 | 2.309485 |
+| cover | 909.101849 | 2.582597 |
+| legacy | 8.706580 | 2.506775 |
+| fastCover(opt) | 188.598079 | 2.596761 |
+| fastCover(k=200) | 13.392734 | 2.592985 |
diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c
index abd592c..6f990e0 100644
--- a/contrib/experimental_dict_builders/fastCover/fastCover.c
+++ b/contrib/experimental_dict_builders/fastCover/fastCover.c
@@ -48,7 +48,7 @@
/*-*************************************
-* Hash Function
+* Hash Functions
***************************************/
static const U64 prime6bytes = 227718039650203ULL;
static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; }
@@ -58,6 +58,7 @@
static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+
/**
* Hash the d-byte value pointed to by p and mod 2^f
*/
@@ -140,29 +141,41 @@
activeSegment.begin = begin;
activeSegment.end = begin;
activeSegment.score = 0;
- /* Slide the activeSegment through the whole epoch.
- * Save the best segment in bestSegment.
- */
- while (activeSegment.end < end) {
- /* Get hash value of current dmer */
- const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d);
- /* Add frequency of this index to score */
- activeSegment.score += freqs[index];
- /* Increment end of segment */
- activeSegment.end += 1;
- /* If the window is now too large, drop the first position */
- if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
- /* Get hash value of the dmer to be eliminated from active segment */
- const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d);
- /* Subtract frequency of this index from score */
- activeSegment.score -= freqs[delIndex];
- /* Increment start of segment */
- activeSegment.begin += 1;
+ {
+ /* Keep track of number of times an index has been seen in current segment */
+ U16* currfreqs =(U16 *)malloc((1 << parameters.f) * sizeof(U16));
+ memset(currfreqs, 0, (1 << parameters.f) * sizeof(*currfreqs));
+ /* Slide the activeSegment through the whole epoch.
+ * Save the best segment in bestSegment.
+ */
+ while (activeSegment.end < end) {
+ /* Get hash value of current dmer */
+ const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d);
+ /* Add frequency of this index to score if this is the first occurence of index in active segment */
+ if (currfreqs[index] == 0) {
+ activeSegment.score += freqs[index];
+ }
+ currfreqs[index] += 1;
+ /* Increment end of segment */
+ activeSegment.end += 1;
+ /* If the window is now too large, drop the first position */
+ if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+ /* Get hash value of the dmer to be eliminated from active segment */
+ const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d);
+ currfreqs[delIndex] -= 1;
+ /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
+ if (currfreqs[delIndex] == 0) {
+ activeSegment.score -= freqs[delIndex];
+ }
+ /* Increment start of segment */
+ activeSegment.begin += 1;
+ }
+ /* If this segment is the best so far save it */
+ if (activeSegment.score > bestSegment.score) {
+ bestSegment = activeSegment;
+ }
}
- /* If this segment is the best so far save it */
- if (activeSegment.score > bestSegment.score) {
- bestSegment = activeSegment;
- }
+ free(currfreqs);
}
{
/* Trim off the zero frequency head and tail from the segment. */
@@ -185,7 +198,7 @@
U32 pos;
for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d);
- freqs[i] = freqs[i]/2;
+ freqs[i] = 0;
}
}
return bestSegment;
@@ -245,12 +258,12 @@
/**
* Calculate for frequency of hash value of each dmer in ctx->samples
*/
-static void FASTCOVER_getFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx){
+static void FASTCOVER_computeFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx){
/* inCurrSample keeps track of this hash value has already be seen in previous dmers in the same sample*/
- size_t* inCurrSample = (size_t *)malloc((1<<f)*sizeof(size_t));
+ BYTE* inCurrSample = (BYTE *)malloc((1 << f) * sizeof(BYTE));
size_t start; /* start of current dmer */
for (unsigned i = 0; i < ctx->nbTrainSamples; i++) {
- memset(inCurrSample, 0, (1 << f)); /* Reset inCurrSample for each sample */
+ memset(inCurrSample, 0, (1 << f) * sizeof(*inCurrSample)); /* Reset inCurrSample for each sample */
size_t currSampleStart = ctx->offsets[i];
size_t currSampleEnd = ctx->offsets[i+1];
start = currSampleStart;
@@ -338,7 +351,7 @@
memset(ctx->freqs, 0, (1 << f) * sizeof(U32));
DISPLAYLEVEL(2, "Computing frequencies\n");
- FASTCOVER_getFrequency(ctx->freqs, f, ctx);
+ FASTCOVER_computeFrequency(ctx->freqs, f, ctx);
return 1;
}
diff --git a/contrib/experimental_dict_builders/fastCover/main.c b/contrib/experimental_dict_builders/fastCover/main.c
index 260eeb2..f286b05 100644
--- a/contrib/experimental_dict_builders/fastCover/main.c
+++ b/contrib/experimental_dict_builders/fastCover/main.c
@@ -165,7 +165,7 @@
params.splitPoint = (double)split/100;
/* Build dictionary */
- sampleInfo* info= getSampleInfo(filenameTable,
+ sampleInfo* info = getSampleInfo(filenameTable,
filenameIdx, blockSize, maxDictSize, zParams.notificationLevel);
operationResult = FASTCOVER_trainFromFiles(outputFile, info, maxDictSize, ¶ms);
diff --git a/contrib/experimental_dict_builders/randomDictBuilder/main.c b/contrib/experimental_dict_builders/randomDictBuilder/main.c
index 3f3a6ca..3ad8857 100644
--- a/contrib/experimental_dict_builders/randomDictBuilder/main.c
+++ b/contrib/experimental_dict_builders/randomDictBuilder/main.c
@@ -149,7 +149,7 @@
params.zParams = zParams;
params.k = k;
- sampleInfo* info= getSampleInfo(filenameTable,
+ sampleInfo* info = getSampleInfo(filenameTable,
filenameIdx, blockSize, maxDictSize, zParams.notificationLevel);
operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, ¶ms);