tanjent@gmail.com | f3b7897 | 2012-03-01 03:38:55 +0000 | [diff] [blame] | 1 | #pragma once |
| 2 | |
| 3 | #include "Types.h" |
| 4 | |
| 5 | #include <math.h> |
| 6 | #include <vector> |
| 7 | #include <map> |
| 8 | #include <algorithm> // for std::sort |
| 9 | #include <string.h> // for memset |
| 10 | #include <stdio.h> // for printf |
| 11 | |
| 12 | double calcScore ( const int * bins, const int bincount, const int ballcount ); |
| 13 | |
| 14 | void plot ( double n ); |
| 15 | |
| 16 | inline double ExpectedCollisions ( double balls, double bins ) |
| 17 | { |
| 18 | return balls - bins + bins * pow(1 - 1/bins,balls); |
| 19 | } |
| 20 | |
| 21 | double chooseK ( int b, int k ); |
| 22 | double chooseUpToK ( int n, int k ); |
| 23 | |
| 24 | //----------------------------------------------------------------------------- |
| 25 | |
| 26 | inline uint32_t f3mix ( uint32_t k ) |
| 27 | { |
| 28 | k ^= k >> 16; |
| 29 | k *= 0x85ebca6b; |
| 30 | k ^= k >> 13; |
| 31 | k *= 0xc2b2ae35; |
| 32 | k ^= k >> 16; |
| 33 | |
| 34 | return k; |
| 35 | } |
| 36 | |
| 37 | //----------------------------------------------------------------------------- |
| 38 | // Sort the hash list, count the total number of collisions and return |
| 39 | // the first N collisions for further processing |
| 40 | |
| 41 | template< typename hashtype > |
| 42 | int FindCollisions ( std::vector<hashtype> & hashes, |
| 43 | HashSet<hashtype> & collisions, |
| 44 | int maxCollisions ) |
| 45 | { |
| 46 | int collcount = 0; |
| 47 | |
| 48 | std::sort(hashes.begin(),hashes.end()); |
| 49 | |
| 50 | for(size_t i = 1; i < hashes.size(); i++) |
| 51 | { |
| 52 | if(hashes[i] == hashes[i-1]) |
| 53 | { |
| 54 | collcount++; |
| 55 | |
| 56 | if((int)collisions.size() < maxCollisions) |
| 57 | { |
| 58 | collisions.insert(hashes[i]); |
| 59 | } |
| 60 | } |
| 61 | } |
| 62 | |
| 63 | return collcount; |
| 64 | } |
| 65 | |
| 66 | //----------------------------------------------------------------------------- |
| 67 | |
| 68 | template < class keytype, typename hashtype > |
| 69 | int PrintCollisions ( hashfunc<hashtype> hash, std::vector<keytype> & keys ) |
| 70 | { |
| 71 | int collcount = 0; |
| 72 | |
| 73 | typedef std::map<hashtype,keytype> htab; |
| 74 | htab tab; |
| 75 | |
| 76 | for(size_t i = 1; i < keys.size(); i++) |
| 77 | { |
| 78 | keytype & k1 = keys[i]; |
| 79 | |
| 80 | hashtype h = hash(&k1,sizeof(keytype),0); |
| 81 | |
| 82 | typename htab::iterator it = tab.find(h); |
| 83 | |
| 84 | if(it != tab.end()) |
| 85 | { |
| 86 | keytype & k2 = (*it).second; |
| 87 | |
| 88 | printf("A: "); |
| 89 | printbits(&k1,sizeof(keytype)); |
| 90 | printf("B: "); |
| 91 | printbits(&k2,sizeof(keytype)); |
| 92 | } |
| 93 | else |
| 94 | { |
| 95 | tab.insert( std::make_pair(h,k1) ); |
| 96 | } |
| 97 | } |
| 98 | |
| 99 | return collcount; |
| 100 | } |
| 101 | |
| 102 | //---------------------------------------------------------------------------- |
| 103 | // Measure the distribution "score" for each possible N-bit span up to 20 bits |
| 104 | |
| 105 | template< typename hashtype > |
| 106 | double TestDistribution ( std::vector<hashtype> & hashes, bool drawDiagram ) |
| 107 | { |
| 108 | printf("Testing distribution - "); |
| 109 | |
| 110 | if(drawDiagram) printf("\n"); |
| 111 | |
| 112 | const int hashbits = sizeof(hashtype) * 8; |
| 113 | |
| 114 | int maxwidth = 20; |
| 115 | |
| 116 | // We need at least 5 keys per bin to reliably test distribution biases |
| 117 | // down to 1%, so don't bother to test sparser distributions than that |
| 118 | |
| 119 | while(double(hashes.size()) / double(1 << maxwidth) < 5.0) |
| 120 | { |
| 121 | maxwidth--; |
| 122 | } |
| 123 | |
| 124 | std::vector<int> bins; |
| 125 | bins.resize(1 << maxwidth); |
| 126 | |
| 127 | double worst = 0; |
| 128 | int worstStart = -1; |
| 129 | int worstWidth = -1; |
| 130 | |
| 131 | for(int start = 0; start < hashbits; start++) |
| 132 | { |
| 133 | int width = maxwidth; |
| 134 | int bincount = (1 << width); |
| 135 | |
| 136 | memset(&bins[0],0,sizeof(int)*bincount); |
| 137 | |
| 138 | for(size_t j = 0; j < hashes.size(); j++) |
| 139 | { |
| 140 | hashtype & hash = hashes[j]; |
| 141 | |
| 142 | uint32_t index = window(&hash,sizeof(hash),start,width); |
| 143 | |
| 144 | bins[index]++; |
| 145 | } |
| 146 | |
| 147 | // Test the distribution, then fold the bins in half, |
| 148 | // repeat until we're down to 256 bins |
| 149 | |
| 150 | if(drawDiagram) printf("["); |
| 151 | |
| 152 | while(bincount >= 256) |
| 153 | { |
| 154 | double n = calcScore(&bins[0],bincount,(int)hashes.size()); |
| 155 | |
| 156 | if(drawDiagram) plot(n); |
| 157 | |
| 158 | if(n > worst) |
| 159 | { |
| 160 | worst = n; |
| 161 | worstStart = start; |
| 162 | worstWidth = width; |
| 163 | } |
| 164 | |
| 165 | width--; |
| 166 | bincount /= 2; |
| 167 | |
| 168 | if(width < 8) break; |
| 169 | |
| 170 | for(int i = 0; i < bincount; i++) |
| 171 | { |
| 172 | bins[i] += bins[i+bincount]; |
| 173 | } |
| 174 | } |
| 175 | |
| 176 | if(drawDiagram) printf("]\n"); |
| 177 | } |
| 178 | |
| 179 | double pct = worst * 100.0; |
| 180 | |
| 181 | printf("Worst bias is the %3d-bit window at bit %3d - %5.3f%%",worstWidth,worstStart,pct); |
| 182 | if(pct >= 1.0) printf(" !!!!! "); |
| 183 | printf("\n"); |
| 184 | |
| 185 | return worst; |
| 186 | } |
| 187 | |
| 188 | //---------------------------------------------------------------------------- |
| 189 | |
| 190 | template < typename hashtype > |
| 191 | bool TestHashList ( std::vector<hashtype> & hashes, std::vector<hashtype> & collisions, bool testDist, bool drawDiagram ) |
| 192 | { |
| 193 | bool result = true; |
| 194 | |
| 195 | { |
| 196 | size_t count = hashes.size(); |
| 197 | |
| 198 | double expected = (double(count) * double(count-1)) / pow(2.0,double(sizeof(hashtype) * 8 + 1)); |
| 199 | |
| 200 | printf("Testing collisions - Expected %8.2f, ",expected); |
| 201 | |
| 202 | double collcount = 0; |
| 203 | |
| 204 | HashSet<hashtype> collisions; |
| 205 | |
| 206 | collcount = FindCollisions(hashes,collisions,1000); |
| 207 | |
| 208 | printf("actual %8.2f (%5.2fx)",collcount, collcount / expected); |
| 209 | |
| 210 | if(sizeof(hashtype) == sizeof(uint32_t)) |
| 211 | { |
| 212 | // 2x expected collisions = fail |
| 213 | |
| 214 | // #TODO - collision failure cutoff needs to be expressed as a standard deviation instead |
| 215 | // of a scale factor, otherwise we fail erroneously if there are a small expected number |
| 216 | // of collisions |
| 217 | |
| 218 | if(double(collcount) / double(expected) > 2.0) |
| 219 | { |
| 220 | printf(" !!!!! "); |
| 221 | result = false; |
| 222 | } |
| 223 | } |
| 224 | else |
| 225 | { |
| 226 | // For all hashes larger than 32 bits, _any_ collisions are a failure. |
| 227 | |
| 228 | if(collcount > 0) |
| 229 | { |
| 230 | printf(" !!!!! "); |
| 231 | result = false; |
| 232 | } |
| 233 | } |
| 234 | |
| 235 | printf("\n"); |
| 236 | } |
| 237 | |
| 238 | //---------- |
| 239 | |
| 240 | if(testDist) |
| 241 | { |
| 242 | TestDistribution(hashes,drawDiagram); |
| 243 | } |
| 244 | |
| 245 | return result; |
| 246 | } |
| 247 | |
| 248 | //---------- |
| 249 | |
| 250 | template < typename hashtype > |
| 251 | bool TestHashList ( std::vector<hashtype> & hashes, bool /*testColl*/, bool testDist, bool drawDiagram ) |
| 252 | { |
| 253 | std::vector<hashtype> collisions; |
| 254 | |
| 255 | return TestHashList(hashes,collisions,testDist,drawDiagram); |
| 256 | } |
| 257 | |
| 258 | //----------------------------------------------------------------------------- |
| 259 | |
| 260 | template < class keytype, typename hashtype > |
| 261 | bool TestKeyList ( hashfunc<hashtype> hash, std::vector<keytype> & keys, bool testColl, bool testDist, bool drawDiagram ) |
| 262 | { |
| 263 | int keycount = (int)keys.size(); |
| 264 | |
| 265 | std::vector<hashtype> hashes; |
| 266 | |
| 267 | hashes.resize(keycount); |
| 268 | |
| 269 | printf("Hashing"); |
| 270 | |
| 271 | for(int i = 0; i < keycount; i++) |
| 272 | { |
| 273 | if(i % (keycount / 10) == 0) printf("."); |
| 274 | |
| 275 | keytype & k = keys[i]; |
| 276 | |
| 277 | hash(&k,sizeof(k),0,&hashes[i]); |
| 278 | } |
| 279 | |
| 280 | printf("\n"); |
| 281 | |
| 282 | bool result = TestHashList(hashes,testColl,testDist,drawDiagram); |
| 283 | |
| 284 | printf("\n"); |
| 285 | |
| 286 | return result; |
| 287 | } |
| 288 | |
| 289 | //----------------------------------------------------------------------------- |
| 290 | // Bytepair test - generate 16-bit indices from all possible non-overlapping |
| 291 | // 8-bit sections of the hash value, check distribution on all of them. |
| 292 | |
| 293 | // This is a very good test for catching weak intercorrelations between bits - |
| 294 | // much harder to pass than the normal distribution test. However, it doesn't |
| 295 | // really model the normal usage of hash functions in hash table lookup, so |
| 296 | // I'm not sure it's that useful (and hash functions that fail this test but |
| 297 | // pass the normal distribution test still work well in practice) |
| 298 | |
| 299 | template < typename hashtype > |
| 300 | double TestDistributionBytepairs ( std::vector<hashtype> & hashes, bool drawDiagram ) |
| 301 | { |
| 302 | const int nbytes = sizeof(hashtype); |
| 303 | const int hashbits = nbytes * 8; |
| 304 | |
| 305 | const int nbins = 65536; |
| 306 | |
| 307 | std::vector<int> bins(nbins,0); |
| 308 | |
| 309 | double worst = 0; |
| 310 | |
| 311 | for(int a = 0; a < hashbits; a++) |
| 312 | { |
| 313 | if(drawDiagram) if((a % 8 == 0) && (a > 0)) printf("\n"); |
| 314 | |
| 315 | if(drawDiagram) printf("["); |
| 316 | |
| 317 | for(int b = 0; b < hashbits; b++) |
| 318 | { |
| 319 | if(drawDiagram) if((b % 8 == 0) && (b > 0)) printf(" "); |
| 320 | |
| 321 | bins.clear(); |
| 322 | bins.resize(nbins,0); |
| 323 | |
| 324 | for(size_t i = 0; i < hashes.size(); i++) |
| 325 | { |
| 326 | hashtype & hash = hashes[i]; |
| 327 | |
| 328 | uint32_t pa = window(&hash,sizeof(hash),a,8); |
| 329 | uint32_t pb = window(&hash,sizeof(hash),b,8); |
| 330 | |
| 331 | bins[pa | (pb << 8)]++; |
| 332 | } |
| 333 | |
| 334 | double s = calcScore(bins,bins.size(),hashes.size()); |
| 335 | |
| 336 | if(drawDiagram) plot(s); |
| 337 | |
| 338 | if(s > worst) |
| 339 | { |
| 340 | worst = s; |
| 341 | } |
| 342 | } |
| 343 | |
| 344 | if(drawDiagram) printf("]\n"); |
| 345 | } |
| 346 | |
| 347 | return worst; |
| 348 | } |
| 349 | |
| 350 | //----------------------------------------------------------------------------- |
| 351 | // Simplified test - only check 64k distributions, and only on byte boundaries |
| 352 | |
| 353 | template < typename hashtype > |
| 354 | void TestDistributionFast ( std::vector<hashtype> & hashes, double & dworst, double & davg ) |
| 355 | { |
| 356 | const int hashbits = sizeof(hashtype) * 8; |
| 357 | const int nbins = 65536; |
| 358 | |
| 359 | std::vector<int> bins(nbins,0); |
| 360 | |
| 361 | dworst = -1.0e90; |
| 362 | davg = 0; |
| 363 | |
| 364 | for(int start = 0; start < hashbits; start += 8) |
| 365 | { |
| 366 | bins.clear(); |
| 367 | bins.resize(nbins,0); |
| 368 | |
| 369 | for(size_t j = 0; j < hashes.size(); j++) |
| 370 | { |
| 371 | hashtype & hash = hashes[j]; |
| 372 | |
| 373 | uint32_t index = window(&hash,sizeof(hash),start,16); |
| 374 | |
| 375 | bins[index]++; |
| 376 | } |
| 377 | |
| 378 | double n = calcScore(&bins.front(),(int)bins.size(),(int)hashes.size()); |
| 379 | |
| 380 | davg += n; |
| 381 | |
| 382 | if(n > dworst) dworst = n; |
| 383 | } |
| 384 | |
| 385 | davg /= double(hashbits/8); |
| 386 | } |
| 387 | |
| 388 | //----------------------------------------------------------------------------- |