tanjent@gmail.com | f3b7897 | 2012-03-01 03:38:55 +0000 | [diff] [blame] | 1 | #include "SpeedTest.h" |
| 2 | |
| 3 | #include "Random.h" |
| 4 | |
| 5 | #include <stdio.h> // for printf |
| 6 | #include <memory.h> // for memset |
| 7 | #include <math.h> // for sqrt |
| 8 | #include <algorithm> // for sort |
| 9 | |
| 10 | //----------------------------------------------------------------------------- |
| 11 | // We view our timing values as a series of random variables V that has been |
| 12 | // contaminated with occasional outliers due to cache misses, thread |
| 13 | // preemption, etcetera. To filter out the outliers, we search for the largest |
| 14 | // subset of V such that all its values are within three standard deviations |
| 15 | // of the mean. |
| 16 | |
| 17 | double CalcMean ( std::vector<double> & v ) |
| 18 | { |
| 19 | double mean = 0; |
| 20 | |
| 21 | for(int i = 0; i < (int)v.size(); i++) |
| 22 | { |
| 23 | mean += v[i]; |
| 24 | } |
| 25 | |
| 26 | mean /= double(v.size()); |
| 27 | |
| 28 | return mean; |
| 29 | } |
| 30 | |
| 31 | double CalcMean ( std::vector<double> & v, int a, int b ) |
| 32 | { |
| 33 | double mean = 0; |
| 34 | |
| 35 | for(int i = a; i <= b; i++) |
| 36 | { |
| 37 | mean += v[i]; |
| 38 | } |
| 39 | |
| 40 | mean /= (b-a+1); |
| 41 | |
| 42 | return mean; |
| 43 | } |
| 44 | |
| 45 | double CalcStdv ( std::vector<double> & v, int a, int b ) |
| 46 | { |
| 47 | double mean = CalcMean(v,a,b); |
| 48 | |
| 49 | double stdv = 0; |
| 50 | |
| 51 | for(int i = a; i <= b; i++) |
| 52 | { |
| 53 | double x = v[i] - mean; |
| 54 | |
| 55 | stdv += x*x; |
| 56 | } |
| 57 | |
| 58 | stdv = sqrt(stdv / (b-a+1)); |
| 59 | |
| 60 | return stdv; |
| 61 | } |
| 62 | |
| 63 | // Return true if the largest value in v[0,len) is more than three |
| 64 | // standard deviations from the mean |
| 65 | |
| 66 | bool ContainsOutlier ( std::vector<double> & v, size_t len ) |
| 67 | { |
| 68 | double mean = 0; |
| 69 | |
| 70 | for(size_t i = 0; i < len; i++) |
| 71 | { |
| 72 | mean += v[i]; |
| 73 | } |
| 74 | |
| 75 | mean /= double(len); |
| 76 | |
| 77 | double stdv = 0; |
| 78 | |
| 79 | for(size_t i = 0; i < len; i++) |
| 80 | { |
| 81 | double x = v[i] - mean; |
| 82 | stdv += x*x; |
| 83 | } |
| 84 | |
| 85 | stdv = sqrt(stdv / double(len)); |
| 86 | |
| 87 | double cutoff = mean + stdv*3; |
| 88 | |
| 89 | return v[len-1] > cutoff; |
| 90 | } |
| 91 | |
| 92 | // Do a binary search to find the largest subset of v that does not contain |
| 93 | // outliers. |
| 94 | |
| 95 | void FilterOutliers ( std::vector<double> & v ) |
| 96 | { |
| 97 | std::sort(v.begin(),v.end()); |
| 98 | |
| 99 | size_t len = 0; |
| 100 | |
| 101 | for(size_t x = 0x40000000; x; x = x >> 1 ) |
| 102 | { |
| 103 | if((len | x) >= v.size()) continue; |
| 104 | |
| 105 | if(!ContainsOutlier(v,len | x)) |
| 106 | { |
| 107 | len |= x; |
| 108 | } |
| 109 | } |
| 110 | |
| 111 | v.resize(len); |
| 112 | } |
| 113 | |
| 114 | // Iteratively tighten the set to find a subset that does not contain |
| 115 | // outliers. I'm not positive this works correctly in all cases. |
| 116 | |
| 117 | void FilterOutliers2 ( std::vector<double> & v ) |
| 118 | { |
| 119 | std::sort(v.begin(),v.end()); |
| 120 | |
| 121 | int a = 0; |
| 122 | int b = (int)(v.size() - 1); |
| 123 | |
| 124 | for(int i = 0; i < 10; i++) |
| 125 | { |
| 126 | //printf("%d %d\n",a,b); |
| 127 | |
| 128 | double mean = CalcMean(v,a,b); |
| 129 | double stdv = CalcStdv(v,a,b); |
| 130 | |
| 131 | double cutA = mean - stdv*3; |
| 132 | double cutB = mean + stdv*3; |
| 133 | |
| 134 | while((a < b) && (v[a] < cutA)) a++; |
| 135 | while((b > a) && (v[b] > cutB)) b--; |
| 136 | } |
| 137 | |
| 138 | std::vector<double> v2; |
| 139 | |
| 140 | v2.insert(v2.begin(),v.begin()+a,v.begin()+b+1); |
| 141 | |
| 142 | v.swap(v2); |
| 143 | } |
| 144 | |
| 145 | //----------------------------------------------------------------------------- |
| 146 | // We really want the rdtsc() calls to bracket the function call as tightly |
| 147 | // as possible, but that's hard to do portably. We'll try and get as close as |
| 148 | // possible by marking the function as NEVER_INLINE (to keep the optimizer from |
| 149 | // moving it) and marking the timing variables as "volatile register". |
| 150 | |
| 151 | NEVER_INLINE int64_t timehash ( pfHash hash, const void * key, int len, int seed ) |
| 152 | { |
| 153 | volatile register int64_t begin,end; |
| 154 | |
| 155 | uint32_t temp[16]; |
| 156 | |
| 157 | begin = rdtsc(); |
| 158 | |
| 159 | hash(key,len,seed,temp); |
| 160 | |
| 161 | end = rdtsc(); |
| 162 | |
| 163 | return end-begin; |
| 164 | } |
| 165 | |
| 166 | //----------------------------------------------------------------------------- |
| 167 | |
| 168 | double SpeedTest ( pfHash hash, uint32_t seed, const int trials, const int blocksize, const int align ) |
| 169 | { |
| 170 | Rand r(seed); |
| 171 | |
| 172 | uint8_t * buf = new uint8_t[blocksize + 512]; |
| 173 | |
| 174 | uint64_t t1 = reinterpret_cast<uint64_t>(buf); |
| 175 | |
| 176 | t1 = (t1 + 255) & BIG_CONSTANT(0xFFFFFFFFFFFFFF00); |
| 177 | t1 += align; |
| 178 | |
| 179 | uint8_t * block = reinterpret_cast<uint8_t*>(t1); |
| 180 | |
| 181 | r.rand_p(block,blocksize); |
| 182 | |
| 183 | //---------- |
| 184 | |
| 185 | std::vector<double> times; |
| 186 | times.reserve(trials); |
| 187 | |
| 188 | for(int itrial = 0; itrial < trials; itrial++) |
| 189 | { |
| 190 | r.rand_p(block,blocksize); |
| 191 | |
| 192 | double t = (double)timehash(hash,block,blocksize,itrial); |
| 193 | |
| 194 | if(t > 0) times.push_back(t); |
| 195 | } |
| 196 | |
| 197 | //---------- |
| 198 | |
| 199 | std::sort(times.begin(),times.end()); |
| 200 | |
| 201 | FilterOutliers(times); |
| 202 | |
| 203 | delete [] buf; |
| 204 | |
| 205 | return CalcMean(times); |
| 206 | } |
| 207 | |
| 208 | //----------------------------------------------------------------------------- |
| 209 | // 256k blocks seem to give the best results. |
| 210 | |
| 211 | void BulkSpeedTest ( pfHash hash, uint32_t seed ) |
| 212 | { |
| 213 | const int trials = 2999; |
| 214 | const int blocksize = 256 * 1024; |
| 215 | |
| 216 | printf("Bulk speed test - %d-byte keys\n",blocksize); |
| 217 | |
| 218 | for(int align = 0; align < 8; align++) |
| 219 | { |
| 220 | double cycles = SpeedTest(hash,seed,trials,blocksize,align); |
| 221 | |
| 222 | double bestbpc = double(blocksize)/cycles; |
| 223 | |
| 224 | double bestbps = (bestbpc * 3000000000.0 / 1048576.0); |
| 225 | printf("Alignment %2d - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n",align,bestbpc,bestbps); |
| 226 | } |
| 227 | } |
| 228 | |
| 229 | //----------------------------------------------------------------------------- |
| 230 | |
| 231 | void TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bool verbose, double & /*outCycles*/ ) |
| 232 | { |
| 233 | const int trials = 999999; |
| 234 | |
| 235 | if(verbose) printf("Small key speed test - %4d-byte keys - ",keysize); |
| 236 | |
| 237 | double cycles = SpeedTest(hash,seed,trials,keysize,0); |
| 238 | |
| 239 | printf("%8.2f cycles/hash\n",cycles); |
| 240 | } |
| 241 | |
| 242 | //----------------------------------------------------------------------------- |