blob: 5cae64ef2ecb39724aad36068489a0d466df323a [file] [log] [blame]
#pragma once
#include "Core.h"
#include <algorithm>
#include <math.h>
#include <assert.h>
#include <float.h>
double calcScore ( std::vector<int> const & bins, int balls );
void plot ( double n );
inline double ExpectedCollisions ( double balls, double bins )
{
return balls - bins + bins * pow(1 - 1/bins,balls);
}
double comparenorms ( double u1, double s1, double u2, double s2 );
void beta2norm ( double a, double b, double & u, double & s );
double chooseK ( int b, int k );
double chooseUpToK ( int n, int k );
inline uint32_t f3mix ( uint32_t k )
{
k ^= k >> 16;
k *= 0x85ebca6b;
k ^= k >> 13;
k *= 0xc2b2ae35;
k ^= k >> 16;
return k;
}
//-----------------------------------------------------------------------------
template< typename hashtype >
int CountCollisions ( std::vector<hashtype> const & hashes )
{
int collcount = 0;
std::vector<hashtype> temp = hashes;
std::sort(temp.begin(),temp.end());
for(size_t i = 1; i < hashes.size(); i++)
{
if(temp[i] == temp[i-1]) collcount++;
}
return collcount;
}
//-----------------------------------------------------------------------------
/*
template < class keytype, typename hashtype >
int PrintCollisions ( hashfunc<hashtype> hash, std::vector<keytype> & keys )
{
int collcount = 0;
typedef std::map<hashtype,keytype> htab;
htab tab;
for(size_t i = 1; i < keys.size(); i++)
{
keytype & k1 = keys[i];
hashtype h = hash(&k1,sizeof(k),0);
htab::iterator it = tab.find(h);
if(it != tab.end())
{
keytype & k2 = (*it).second;
printf("A: ");
printbits(&k1,sizeof(k1));
printf("B: ");
printbits(&k2,sizeof(k2));
}
else
{
htab.insert( htab::value_type(h,k);
}
}
return collcount;
}
*/
//----------------------------------------------------------------------------
template < typename hashtype >
bool testhashlist( std::vector<hashtype> & hashes, bool testColl, bool testDist, bool drawDiagram )
{
bool verbose = true;
bool result = true;
if(testColl)
{
size_t count = hashes.size();
double expected = (double(count) * double(count-1)) / pow(2.0,double(sizeof(hashtype) * 8 + 1));
if(verbose) printf("Testing collisions - Expected %8.2f, ",expected);
double collcount = 0;
collcount = CountCollisions(hashes);
if(verbose)
{
printf("actual %8.2f (%5.2fx) \n",collcount, collcount / expected);
}
else
{
double collscore = collcount / expected;
printf("Coll score %5.3f, ",collscore);
}
// 2x expected collisions = fail
if(double(collcount) / double(expected) > 2.0)
{
result = false;
}
}
//----------
if(testDist)
{
if(verbose) printf("Testing distribution - ");
if(drawDiagram) printf("\n");
TestDistribution(hashes,drawDiagram);
}
return result;
}
//-----------------------------------------------------------------------------
template < class keytype, typename hashtype >
bool testkeylist ( hashfunc<hashtype> hash, std::vector<keytype> & keys, bool testColl, bool testDist, bool drawDiagram )
{
int keycount = (int)keys.size();
std::vector<hashtype> hashes;
hashes.resize(keycount);
//printf("Hashing keyset");
for(int ikey = 0; ikey < keycount; ikey++)
{
keytype & k = keys[ikey];
//if(ikey % (keycount / 10) == 0) printf(".");
hashes[ikey] = hash(&k,sizeof(k),0);
}
//printf("\n");
bool result = testhashlist(hashes,testColl,testDist,drawDiagram);
return result;
}
//-----------------------------------------------------------------------------
template < typename hashtype >
bool testkeylist_string ( hashfunc<hashtype> hash, std::vector<std::string> & keys, bool testColl, bool testDist )
{
int keycount = (int)keys.size();
std::vector<hashtype> hashes;
hashes.resize(keycount);
//printf("Hashing keyset");
for(int ikey = 0; ikey < keycount; ikey++)
{
std::string & k = keys[ikey];
//if(ikey % (keycount / 10) == 0) printf(".");
hashes[ikey] = hash(&k[0],(int)k.size(),0);
}
//printf("\n");
bool result = testhashlist(hashes,testColl,testDist);
return result;
}
//-----------------------------------------------------------------------------
// Bytepair test - generate 16-bit indices from all possible non-overlapping
// 8-bit sections of the hash value, check distribution on all of them.
// This is a very good test for catching weak intercorrelations between bits -
// much harder to pass than the normal distribution test. However, it doesn't
// really model the normal usage of hash functions in hash table lookup, so
// I'm not sure it's that useful (and hash functions that fail this test but
// pass the normal distribution test still work well in practice)
template < typename hashtype >
double TestDistributionBytepairs ( std::vector<hashtype> & hashes, bool drawDiagram )
{
const int nbytes = sizeof(hashtype);
const int nbits = nbytes * 8;
const int nbins = 65536;
std::vector<int> bins(nbins,0);
double worst = 0;
for(int a = 0; a < nbits; a++)
{
if(drawDiagram) if((a % 8 == 0) && (a > 0)) printf("\n");
if(drawDiagram) printf("[");
for(int b = 0; b < nbits; b++)
{
if(drawDiagram) if((b % 8 == 0) && (b > 0)) printf(" ");
bins.clear();
bins.resize(nbins,0);
for(size_t i = 0; i < hashes.size(); i++)
{
hashtype & hash = hashes[i];
uint32_t pa = window(&hash,sizeof(hash),a,8);
uint32_t pb = window(&hash,sizeof(hash),b,8);
bins[pa | (pb << 8)]++;
}
double s = calcScore(bins,hashes.size());
if(drawDiagram) plot(s);
if(s > worst)
{
worst = s;
}
}
if(drawDiagram) printf("]\n");
}
return worst;
}
//----------------------------------------------------------------------------
// Measure the distribution "score" for each possible N-bit span up to 16 bits
// and draw a nice graph of the output. 'X' in graph = 10% deviation from ideal.
template< typename hashtype >
double TestDistribution ( std::vector<hashtype> & hashes, bool drawDiagram )
{
bool verbose = false;
const int nbits = sizeof(hashtype) * 8;
const int maxwidth = 20;
std::vector<int> bins;
double worst = 0;
int worstStart = -1;
int worstWidth = -1;
for(int width = 1; width <= maxwidth; width++)
{
const int bincount = (1 << width);
//If we don't have enough keys to get 2 per bin, skip the test
//if(double(hashes.size()) / double(bincount) < 2.0) continue;
if(drawDiagram) printf("%2d - [",width);
for(int start = 0; start < nbits; start++)
{
bins.clear();
bins.resize(bincount, 0);
for(size_t j = 0; j < hashes.size(); j++)
{
hashtype & hash = hashes[j];
uint32_t index = window(&hash,sizeof(hash),start,width);
bins[index]++;
}
double n = calcScore(bins,(int)hashes.size());
if(n > worst)
{
worst = n;
worstStart = start;
worstWidth = width;
}
if(drawDiagram) plot(n);
}
if(drawDiagram) printf("]\n");
}
if(verbose)
{
printf("Worst distribution is for (%d:%d) - %f\n",worstStart,(worstStart+worstWidth-1)%32,worst);
}
else
{
printf("Dist score %6.3f\n",(1.0 - worst) * 100);
}
return worst;
}
//-----------------------------------------------------------------------------
// Simplified test - only check 64k distributions, and only on byte boundaries
template < typename hashtype >
void TestDistributionFast ( std::vector<hashtype> & hashes, double & dworst, double & davg )
{
const int nbits = sizeof(hashtype) * 8;
const int nbins = 65536;
std::vector<int> bins(nbins,0);
dworst = -1.0e90;
davg = 0;
for(int start = 0; start < nbits; start += 8)
{
bins.clear();
bins.resize(nbins,0);
for(size_t j = 0; j < hashes.size(); j++)
{
hashtype & hash = hashes[j];
uint32_t index = window(&hash,sizeof(hash),start,16);
bins[index]++;
}
double n = calcScore(bins,(int)hashes.size());
davg += n;
if(n > dworst) dworst = n;
}
davg /= double(nbits/8);
}
//-----------------------------------------------------------------------------
/*
struct Stats
{
enum mode
{
AVALANCHE,
HISTOGRAM,
};
Stats ( int mode, std::vector<int> const & bins, int balls )
{
switch(mode)
{
case AVALANCHE:
calcAvalanche(bins,balls);
break;
case HISTOGRAM:
calcHistogram(bins,balls);
break;
default:
assert(false);
break;
}
}
//----------
// Histogram mode
void calcHistogram ( std::vector<int> const & bins, int balls )
{
m_nbins = (int)bins.size();
m_nballs = balls;
m_mean = 0;
m_rms = 0;
m_sigma = 0;
m_max = -DBL_MAX;
m_min = DBL_MAX;
for(size_t i = 0; i < bins.size(); i++)
{
double x = bins[i];
m_mean += x;
m_rms += x*x;
m_max = x > m_max ? x : m_max;
m_min = x < m_min ? x : m_min;
}
m_mean /= m_nbins;
m_rms /= m_nbins;
m_rms = sqrt(m_rms);
for(size_t i = 0; i < bins.size(); i++)
{
double d = bins[i] - m_mean;
m_sigma += d*d;
}
m_sigma /= m_nbins;
m_sigma = sqrt(m_sigma);
}
//----------
// Normalized standard deviation
double calcNSD ( std::vector<int> const & bins, int balls )
{
double n = (int)bins.size();
double k = balls;
double p = 1.0/n;
double u = k*p;
double s = sqrt(k*p*(1-p));
double c = 0;
for(size_t i = 0; i < bins.size(); i++)
{
double d = bins[i];
d = (d-u)/s;
c += d*d;
}
m_nsd = sqrt(c / m_nbins);
}
double calcScore ( std::vector<int> const & bins, int balls )
{
double n = (int)bins.size();
double k = balls;
// compute rms value
double r = 0;
for(size_t i = 0; i < bins.size(); i++)
{
double b = bins[i];
r += b*b;
}
r = sqrt(r / n);
// compute fill factor
double f = (k*k - 1) / (n*r*r - k);
// rescale to (0,1) with 0 = good, 1 = bad
m_score = 1 - (f / n);
}
//----------
// Avalanche statistics - convert each table entry to a bias value
// and compute stats based on that.
void calcAvalanche ( std::vector<int> const & bins, int balls )
{
m_nbins = (int)bins.size();
m_nballs = balls;
m_mean = 0;
m_rms = 0;
m_sigma = 0;
m_max = -DBL_MAX;
m_min = DBL_MAX;
m_nbad = 0;
for(size_t i = 0; i < bins.size(); i++)
{
double x = (bins[i] / m_nballs) * 2 - 1;
m_mean += x;
m_rms += x*x;
x = fabs(x);
if(x > 0.7) m_nbad++;
m_max = x > m_max ? x : m_max;
m_min = x < m_min ? x : m_min;
}
m_mean /= m_nbins;
m_rms /= m_nbins;
m_rms = sqrt(m_rms);
for(size_t i = 0; i < bins.size(); i++)
{
double x = (bins[i] / m_nballs) * 2 - 1;
double d = x - m_mean;
m_sigma += d*d;
}
m_sigma /= m_nbins;
m_sigma = sqrt(m_sigma);
}
double m_nbins;
double m_nballs;
double m_mean;
double m_rms;
double m_sigma;
double m_nsd;
double m_score;
double m_nbad;
double m_max;
double m_min;
};
*/
//-----------------------------------------------------------------------------