blob: 3565e80c4f692245fd9db48989acdb1475487bd8 [file] [log] [blame]
tanjent@gmail.comf3b78972012-03-01 03:38:55 +00001#pragma once
2
3#include "Types.h"
4
5#include <math.h>
6#include <vector>
7#include <map>
8#include <algorithm> // for std::sort
9#include <string.h> // for memset
10#include <stdio.h> // for printf
11
12double calcScore ( const int * bins, const int bincount, const int ballcount );
13
14void plot ( double n );
15
16inline double ExpectedCollisions ( double balls, double bins )
17{
18 return balls - bins + bins * pow(1 - 1/bins,balls);
19}
20
21double chooseK ( int b, int k );
22double chooseUpToK ( int n, int k );
23
24//-----------------------------------------------------------------------------
25
26inline uint32_t f3mix ( uint32_t k )
27{
28 k ^= k >> 16;
29 k *= 0x85ebca6b;
30 k ^= k >> 13;
31 k *= 0xc2b2ae35;
32 k ^= k >> 16;
33
34 return k;
35}
36
37//-----------------------------------------------------------------------------
38// Sort the hash list, count the total number of collisions and return
39// the first N collisions for further processing
40
41template< typename hashtype >
42int FindCollisions ( std::vector<hashtype> & hashes,
43 HashSet<hashtype> & collisions,
44 int maxCollisions )
45{
46 int collcount = 0;
47
48 std::sort(hashes.begin(),hashes.end());
49
50 for(size_t i = 1; i < hashes.size(); i++)
51 {
52 if(hashes[i] == hashes[i-1])
53 {
54 collcount++;
55
56 if((int)collisions.size() < maxCollisions)
57 {
58 collisions.insert(hashes[i]);
59 }
60 }
61 }
62
63 return collcount;
64}
65
66//-----------------------------------------------------------------------------
67
68template < class keytype, typename hashtype >
69int PrintCollisions ( hashfunc<hashtype> hash, std::vector<keytype> & keys )
70{
71 int collcount = 0;
72
73 typedef std::map<hashtype,keytype> htab;
74 htab tab;
75
76 for(size_t i = 1; i < keys.size(); i++)
77 {
78 keytype & k1 = keys[i];
79
80 hashtype h = hash(&k1,sizeof(keytype),0);
81
82 typename htab::iterator it = tab.find(h);
83
84 if(it != tab.end())
85 {
86 keytype & k2 = (*it).second;
87
88 printf("A: ");
89 printbits(&k1,sizeof(keytype));
90 printf("B: ");
91 printbits(&k2,sizeof(keytype));
92 }
93 else
94 {
95 tab.insert( std::make_pair(h,k1) );
96 }
97 }
98
99 return collcount;
100}
101
102//----------------------------------------------------------------------------
103// Measure the distribution "score" for each possible N-bit span up to 20 bits
104
105template< typename hashtype >
106double TestDistribution ( std::vector<hashtype> & hashes, bool drawDiagram )
107{
108 printf("Testing distribution - ");
109
110 if(drawDiagram) printf("\n");
111
112 const int hashbits = sizeof(hashtype) * 8;
113
114 int maxwidth = 20;
115
116 // We need at least 5 keys per bin to reliably test distribution biases
117 // down to 1%, so don't bother to test sparser distributions than that
118
119 while(double(hashes.size()) / double(1 << maxwidth) < 5.0)
120 {
121 maxwidth--;
122 }
123
124 std::vector<int> bins;
125 bins.resize(1 << maxwidth);
126
127 double worst = 0;
128 int worstStart = -1;
129 int worstWidth = -1;
130
131 for(int start = 0; start < hashbits; start++)
132 {
133 int width = maxwidth;
134 int bincount = (1 << width);
135
136 memset(&bins[0],0,sizeof(int)*bincount);
137
138 for(size_t j = 0; j < hashes.size(); j++)
139 {
140 hashtype & hash = hashes[j];
141
142 uint32_t index = window(&hash,sizeof(hash),start,width);
143
144 bins[index]++;
145 }
146
147 // Test the distribution, then fold the bins in half,
148 // repeat until we're down to 256 bins
149
150 if(drawDiagram) printf("[");
151
152 while(bincount >= 256)
153 {
154 double n = calcScore(&bins[0],bincount,(int)hashes.size());
155
156 if(drawDiagram) plot(n);
157
158 if(n > worst)
159 {
160 worst = n;
161 worstStart = start;
162 worstWidth = width;
163 }
164
165 width--;
166 bincount /= 2;
167
168 if(width < 8) break;
169
170 for(int i = 0; i < bincount; i++)
171 {
172 bins[i] += bins[i+bincount];
173 }
174 }
175
176 if(drawDiagram) printf("]\n");
177 }
178
179 double pct = worst * 100.0;
180
181 printf("Worst bias is the %3d-bit window at bit %3d - %5.3f%%",worstWidth,worstStart,pct);
182 if(pct >= 1.0) printf(" !!!!! ");
183 printf("\n");
184
185 return worst;
186}
187
188//----------------------------------------------------------------------------
189
190template < typename hashtype >
191bool TestHashList ( std::vector<hashtype> & hashes, std::vector<hashtype> & collisions, bool testDist, bool drawDiagram )
192{
193 bool result = true;
194
195 {
196 size_t count = hashes.size();
197
198 double expected = (double(count) * double(count-1)) / pow(2.0,double(sizeof(hashtype) * 8 + 1));
199
200 printf("Testing collisions - Expected %8.2f, ",expected);
201
202 double collcount = 0;
203
204 HashSet<hashtype> collisions;
205
206 collcount = FindCollisions(hashes,collisions,1000);
207
208 printf("actual %8.2f (%5.2fx)",collcount, collcount / expected);
209
210 if(sizeof(hashtype) == sizeof(uint32_t))
211 {
212 // 2x expected collisions = fail
213
214 // #TODO - collision failure cutoff needs to be expressed as a standard deviation instead
215 // of a scale factor, otherwise we fail erroneously if there are a small expected number
216 // of collisions
217
218 if(double(collcount) / double(expected) > 2.0)
219 {
220 printf(" !!!!! ");
221 result = false;
222 }
223 }
224 else
225 {
226 // For all hashes larger than 32 bits, _any_ collisions are a failure.
227
228 if(collcount > 0)
229 {
230 printf(" !!!!! ");
231 result = false;
232 }
233 }
234
235 printf("\n");
236 }
237
238 //----------
239
240 if(testDist)
241 {
242 TestDistribution(hashes,drawDiagram);
243 }
244
245 return result;
246}
247
248//----------
249
250template < typename hashtype >
251bool TestHashList ( std::vector<hashtype> & hashes, bool /*testColl*/, bool testDist, bool drawDiagram )
252{
253 std::vector<hashtype> collisions;
254
255 return TestHashList(hashes,collisions,testDist,drawDiagram);
256}
257
258//-----------------------------------------------------------------------------
259
260template < class keytype, typename hashtype >
261bool TestKeyList ( hashfunc<hashtype> hash, std::vector<keytype> & keys, bool testColl, bool testDist, bool drawDiagram )
262{
263 int keycount = (int)keys.size();
264
265 std::vector<hashtype> hashes;
266
267 hashes.resize(keycount);
268
269 printf("Hashing");
270
271 for(int i = 0; i < keycount; i++)
272 {
273 if(i % (keycount / 10) == 0) printf(".");
274
275 keytype & k = keys[i];
276
277 hash(&k,sizeof(k),0,&hashes[i]);
278 }
279
280 printf("\n");
281
282 bool result = TestHashList(hashes,testColl,testDist,drawDiagram);
283
284 printf("\n");
285
286 return result;
287}
288
289//-----------------------------------------------------------------------------
290// Bytepair test - generate 16-bit indices from all possible non-overlapping
291// 8-bit sections of the hash value, check distribution on all of them.
292
293// This is a very good test for catching weak intercorrelations between bits -
294// much harder to pass than the normal distribution test. However, it doesn't
295// really model the normal usage of hash functions in hash table lookup, so
296// I'm not sure it's that useful (and hash functions that fail this test but
297// pass the normal distribution test still work well in practice)
298
299template < typename hashtype >
300double TestDistributionBytepairs ( std::vector<hashtype> & hashes, bool drawDiagram )
301{
302 const int nbytes = sizeof(hashtype);
303 const int hashbits = nbytes * 8;
304
305 const int nbins = 65536;
306
307 std::vector<int> bins(nbins,0);
308
309 double worst = 0;
310
311 for(int a = 0; a < hashbits; a++)
312 {
313 if(drawDiagram) if((a % 8 == 0) && (a > 0)) printf("\n");
314
315 if(drawDiagram) printf("[");
316
317 for(int b = 0; b < hashbits; b++)
318 {
319 if(drawDiagram) if((b % 8 == 0) && (b > 0)) printf(" ");
320
321 bins.clear();
322 bins.resize(nbins,0);
323
324 for(size_t i = 0; i < hashes.size(); i++)
325 {
326 hashtype & hash = hashes[i];
327
328 uint32_t pa = window(&hash,sizeof(hash),a,8);
329 uint32_t pb = window(&hash,sizeof(hash),b,8);
330
331 bins[pa | (pb << 8)]++;
332 }
333
334 double s = calcScore(bins,bins.size(),hashes.size());
335
336 if(drawDiagram) plot(s);
337
338 if(s > worst)
339 {
340 worst = s;
341 }
342 }
343
344 if(drawDiagram) printf("]\n");
345 }
346
347 return worst;
348}
349
350//-----------------------------------------------------------------------------
351// Simplified test - only check 64k distributions, and only on byte boundaries
352
353template < typename hashtype >
354void TestDistributionFast ( std::vector<hashtype> & hashes, double & dworst, double & davg )
355{
356 const int hashbits = sizeof(hashtype) * 8;
357 const int nbins = 65536;
358
359 std::vector<int> bins(nbins,0);
360
361 dworst = -1.0e90;
362 davg = 0;
363
364 for(int start = 0; start < hashbits; start += 8)
365 {
366 bins.clear();
367 bins.resize(nbins,0);
368
369 for(size_t j = 0; j < hashes.size(); j++)
370 {
371 hashtype & hash = hashes[j];
372
373 uint32_t index = window(&hash,sizeof(hash),start,16);
374
375 bins[index]++;
376 }
377
378 double n = calcScore(&bins.front(),(int)bins.size(),(int)hashes.size());
379
380 davg += n;
381
382 if(n > dworst) dworst = n;
383 }
384
385 davg /= double(hashbits/8);
386}
387
388//-----------------------------------------------------------------------------