njn | ec0c27a | 2005-12-10 23:11:28 +0000 | [diff] [blame] | 1 | // This small program computes a Fast Fourier Transform. It tests |
| 2 | // Valgrind's handling of FP operations. It is representative of all |
| 3 | // programs that do a lot of FP operations. |
| 4 | |
njn | 86c2349 | 2005-12-13 04:06:29 +0000 | [diff] [blame] | 5 | // Licensing: This program is closely based on the one of the same name from |
| 6 | // http://www.fourmilab.ch/. The front page of that site says: |
njn | ec0c27a | 2005-12-10 23:11:28 +0000 | [diff] [blame] | 7 | // |
| 8 | // "Except for a few clearly-marked exceptions, all the material on this |
| 9 | // site is in the public domain and may be used in any manner without |
| 10 | // permission, restriction, attribution, or compensation." |
| 11 | |
| 12 | /* |
| 13 | |
| 14 | Two-dimensional FFT benchmark |
| 15 | |
| 16 | Designed and implemented by John Walker in April of 1989. |
| 17 | |
| 18 | This benchmark executes a specified number of passes (default |
| 19 | 20) through a loop in which each iteration performs a fast |
| 20 | Fourier transform of a square matrix (default size 256x256) of |
| 21 | complex numbers (default precision double), followed by the |
| 22 | inverse transform. After all loop iterations are performed |
| 23 | the results are checked against known correct values. |
| 24 | |
| 25 | This benchmark is intended for use on C implementations which |
| 26 | define "int" as 32 bits or longer and permit allocation and |
| 27 | direct addressing of arrays larger than one megabyte. |
| 28 | |
| 29 | If CAPOUT is defined, the result after all iterations is |
| 30 | written as a CA Lab pattern file. This is intended for |
| 31 | debugging in case horribly wrong results are obtained on a |
| 32 | given machine. |
| 33 | |
| 34 | Archival timings are run with the definitions below set as |
| 35 | follows: Float = double, Asize = 256, Passes = 20, CAPOUT not |
| 36 | defined. |
| 37 | |
| 38 | Time (seconds) System |
| 39 | |
| 40 | 2393.93 Sun 3/260, SunOS 3.4, C, "-f68881 -O". |
| 41 | (John Walker). |
| 42 | |
| 43 | 1928 Macintosh IIx, MPW C 3.0, "-mc68020 |
| 44 | -mc68881 -elems881 -m". (Hugh Hoover). |
| 45 | |
| 46 | 1636.1 Sun 4/110, "cc -O3 -lm". (Michael McClary). |
| 47 | The suspicion is that this is software |
| 48 | floating point. |
| 49 | |
| 50 | 1556.7 Macintosh II, A/UX, "cc -O -lm" |
| 51 | (Michael McClary). |
| 52 | |
| 53 | 1388.8 Sun 386i/250, SunOS 4.0.1 C |
| 54 | "-O /usr/lib/trig.il". (James Carrington). |
| 55 | |
| 56 | 1331.93 Sun 3/60, SunOS 4.0.1, C, |
| 57 | "-O4 -f68881 /usr/lib/libm.il" |
| 58 | (Bob Elman). |
| 59 | |
| 60 | 1204.0 Apollo Domain DN4000, C, "-cpu 3000 -opt 4". |
| 61 | (Sam Crupi). |
| 62 | |
| 63 | 1174.66 Compaq 386/25, SCO Xenix 386 C. |
| 64 | (Peter Shieh). |
| 65 | |
| 66 | 1068 Compaq 386/25, SCO Xenix 386, |
| 67 | Metaware High C. (Robert Wenig). |
| 68 | |
| 69 | 1064.0 Sun 3/80, SunOS 4.0.3 Beta C |
| 70 | "-O3 -f68881 /usr/lib/libm.il". (James Carrington). |
| 71 | |
| 72 | 1061.4 Compaq 386/25, SCO Xenix, High C 1.4. |
| 73 | (James Carrington). |
| 74 | |
| 75 | 1059.79 Compaq 386/25, 387/25, High C 1.4, |
| 76 | DOS|Extender 2.2, 387 inline code |
| 77 | generation. (Nathan Bender). |
| 78 | |
| 79 | 777.14 Compaq 386/25, IIT 3C87-25 (387 Compatible), |
| 80 | High C 1.5, DOS|Extender 2.2, 387 inline |
| 81 | code generation. (Nathan Bender). |
| 82 | |
| 83 | 751 Compaq DeskPro 386/33, High C 1.5 + DOS|Extender, |
| 84 | 387 code generation. (James Carrington). |
| 85 | |
| 86 | 431.44 Compaq 386/25, Weitek 3167-25, DOS 3.31, |
| 87 | High C 1.4, DOS|Extender, Weitek code generation. |
| 88 | (Nathan Bender). |
| 89 | |
| 90 | 344.9 Compaq 486/25, Metaware High C 1.6, Phar Lap |
| 91 | DOS|Extender, in-line floating point. (Nathan |
| 92 | Bender). |
| 93 | |
| 94 | 324.2 Data General Motorola 88000, 16 Mhz, Gnu C. |
| 95 | |
| 96 | 323.1 Sun 4/280, C, "-O4". (Eric Hill). |
| 97 | |
| 98 | 254 Compaq SystemPro 486/33, High C 1.5 + DOS|Extender, |
| 99 | 387 code generation. (James Carrington). |
| 100 | |
| 101 | 242.8 Silicon Graphics Personal IRIS, MIPS R2000A, |
| 102 | 12.5 Mhz, "-O3" (highest level optimisation). |
| 103 | (Mike Zentner). |
| 104 | |
| 105 | 233.0 Sun SPARCStation 1, C, "-O4", SunOS 4.0.3. |
| 106 | (Nathan Bender). |
| 107 | |
| 108 | 187.30 DEC PMAX 3100, MIPS 2000 chip. |
| 109 | (Robert Wenig). |
| 110 | |
| 111 | 120.46 Sun SparcStation 2, C, "-O4", SunOS 4.1.1. |
| 112 | (John Walker). |
| 113 | |
| 114 | 120.21 DEC 3MAX, MIPS 3000, "-O4". |
| 115 | |
| 116 | 98.0 Intel i860 experimental environment, |
| 117 | OS/2, data caching disabled. (Kern |
| 118 | Sibbald). |
| 119 | |
| 120 | 34.9 Silicon Graphics Indigo², MIPS R4400, |
| 121 | 175 Mhz, IRIX 5.2, "-O". |
| 122 | |
| 123 | 32.4 Pentium 133, Windows NT, Microsoft Visual |
| 124 | C++ 4.0. |
| 125 | |
| 126 | 17.25 Silicon Graphics Indigo², MIPS R4400, |
| 127 | 175 Mhz, IRIX 6.5, "-O3". |
| 128 | |
| 129 | 14.10 Dell Dimension XPS R100, Pentium II 400 MHz, |
| 130 | Windows 98, Microsoft Visual C 5.0. |
| 131 | |
| 132 | 10.7 Hewlett-Packard Kayak XU 450Mhz Pentium II, |
| 133 | Microsoft Visual C++ 6.0, Windows NT 4.0sp3. (Nathan Bender). |
| 134 | |
| 135 | 5.09 Sun Ultra 2, UltraSPARC V9, 300 MHz, gcc -O3. |
| 136 | |
| 137 | 0.846 Dell Inspiron 9100, Pentium 4, 3.4 GHz, gcc -O3. |
| 138 | |
| 139 | */ |
| 140 | |
| 141 | #include <stdio.h> |
| 142 | #include <stdlib.h> |
| 143 | #include <math.h> |
| 144 | #include <string.h> |
| 145 | |
| 146 | /* The program may be run with Float defined as either float or |
| 147 | double. With IEEE arithmetic, the same answers are generated for |
| 148 | either floating point mode. */ |
| 149 | |
| 150 | #define Float double /* Floating point type used in FFT */ |
| 151 | |
| 152 | #define Asize 256 /* Array edge size */ |
| 153 | #define Passes 20 /* Number of FFT/Inverse passes */ |
| 154 | |
| 155 | #define max(a,b) ((a)>(b)?(a):(b)) |
| 156 | #define min(a,b) ((a)<=(b)?(a):(b)) |
| 157 | |
njn | ec0c27a | 2005-12-10 23:11:28 +0000 | [diff] [blame] | 158 | /* |
| 159 | |
| 160 | Multi-dimensional fast Fourier transform |
| 161 | |
| 162 | Adapted from Press et al., "Numerical Recipes in C". |
| 163 | |
| 164 | */ |
| 165 | |
| 166 | #define SWAP(a,b) tempr=(a); (a)=(b); (b)=tempr |
| 167 | |
| 168 | static void fourn(data, nn, ndim, isign) |
| 169 | Float data[]; |
| 170 | int nn[], ndim, isign; |
| 171 | { |
| 172 | register int i1, i2, i3; |
| 173 | int i2rev, i3rev, ip1, ip2, ip3, ifp1, ifp2; |
| 174 | int ibit, idim, k1, k2, n, nprev, nrem, ntot; |
| 175 | Float tempi, tempr; |
| 176 | double theta, wi, wpi, wpr, wr, wtemp; |
| 177 | |
| 178 | ntot = 1; |
| 179 | for (idim = 1; idim <= ndim; idim++) |
| 180 | ntot *= nn[idim]; |
| 181 | nprev = 1; |
| 182 | for (idim = ndim; idim >= 1; idim--) { |
| 183 | n = nn[idim]; |
| 184 | nrem = ntot / (n * nprev); |
| 185 | ip1 = nprev << 1; |
| 186 | ip2 = ip1 * n; |
| 187 | ip3 = ip2 * nrem; |
| 188 | i2rev = 1; |
| 189 | for (i2 = 1; i2 <= ip2; i2 += ip1) { |
| 190 | if (i2 < i2rev) { |
| 191 | for (i1 = i2; i1 <= i2 + ip1 - 2; i1 += 2) { |
| 192 | for (i3 = i1; i3 <= ip3; i3 += ip2) { |
| 193 | i3rev = i2rev + i3 - i2; |
| 194 | SWAP(data[i3], data[i3rev]); |
| 195 | SWAP(data[i3 + 1], data[i3rev + 1]); |
| 196 | } |
| 197 | } |
| 198 | } |
| 199 | ibit = ip2 >> 1; |
| 200 | while (ibit >= ip1 && i2rev > ibit) { |
| 201 | i2rev -= ibit; |
| 202 | ibit >>= 1; |
| 203 | } |
| 204 | i2rev += ibit; |
| 205 | } |
| 206 | ifp1 = ip1; |
| 207 | while (ifp1 < ip2) { |
| 208 | ifp2 = ifp1 << 1; |
| 209 | theta = isign * 6.28318530717959 / (ifp2 / ip1); |
| 210 | wtemp = sin(0.5 * theta); |
| 211 | wpr = -2.0 * wtemp * wtemp; |
| 212 | wpi = sin(theta); |
| 213 | wr = 1.0; |
| 214 | wi = 0.0; |
| 215 | for (i3 = 1; i3 <= ifp1; i3 += ip1) { |
| 216 | for (i1 = i3; i1 <= i3 + ip1 - 2; i1 += 2) { |
| 217 | for (i2 = i1; i2 <= ip3; i2 += ifp2) { |
| 218 | k1 = i2; |
| 219 | k2 = k1 + ifp1; |
| 220 | tempr = wr * data[k2] - wi * data[k2 + 1]; |
| 221 | tempi = wr * data[k2 + 1] + wi * data[k2]; |
| 222 | data[k2] = data[k1] - tempr; |
| 223 | data[k2 + 1] = data[k1 + 1] - tempi; |
| 224 | data[k1] += tempr; |
| 225 | data[k1 + 1] += tempi; |
| 226 | } |
| 227 | } |
| 228 | wr = (wtemp = wr) * wpr - wi * wpi + wr; |
| 229 | wi = wi * wpr + wtemp * wpi + wi; |
| 230 | } |
| 231 | ifp1 = ifp2; |
| 232 | } |
| 233 | nprev *= n; |
| 234 | } |
| 235 | } |
| 236 | #undef SWAP |
| 237 | |
| 238 | int main() |
| 239 | { |
| 240 | int i, j, k, l, m, npasses = Passes, faedge; |
| 241 | Float *fdata /* , *fd */ ; |
| 242 | static int nsize[] = {0, 0, 0}; |
| 243 | long fanum, fasize; |
| 244 | double mapbase, mapscale, /* x, */ rmin, rmax, imin, imax; |
| 245 | |
| 246 | faedge = Asize; /* FFT array edge size */ |
| 247 | fanum = faedge * faedge; /* Elements in FFT array */ |
| 248 | fasize = ((fanum + 1) * 2 * sizeof(Float)); /* FFT array size */ |
| 249 | nsize[1] = nsize[2] = faedge; |
| 250 | |
| 251 | fdata = (Float *) malloc(fasize); |
| 252 | if (fdata == NULL) { |
| 253 | fprintf(stdout, "Can't allocate data array.\n"); |
| 254 | exit(1); |
| 255 | } |
| 256 | |
| 257 | /* Generate data array to process. */ |
| 258 | |
| 259 | #define Re(x,y) fdata[1 + (faedge * (x) + (y)) * 2] |
| 260 | #define Im(x,y) fdata[2 + (faedge * (x) + (y)) * 2] |
| 261 | |
| 262 | memset(fdata, 0, fasize); |
| 263 | for (i = 0; i < faedge; i++) { |
| 264 | for (j = 0; j < faedge; j++) { |
| 265 | if (((i & 15) == 8) || ((j & 15) == 8)) |
| 266 | Re(i, j) = 128.0; |
| 267 | } |
| 268 | } |
| 269 | |
| 270 | for (i = 0; i < npasses; i++) { |
| 271 | /*printf("Pass %d\n", i);*/ |
| 272 | /* Transform image to frequency domain. */ |
| 273 | fourn(fdata, nsize, 2, 1); |
| 274 | |
| 275 | /* Back-transform to image. */ |
| 276 | fourn(fdata, nsize, 2, -1); |
| 277 | } |
| 278 | |
| 279 | { |
| 280 | double r, ij, ar, ai; |
| 281 | rmin = 1e10; rmax = -1e10; |
| 282 | imin = 1e10; imax = -1e10; |
| 283 | ar = 0; |
| 284 | ai = 0; |
| 285 | |
| 286 | for (i = 1; i <= fanum; i += 2) { |
| 287 | r = fdata[i]; |
| 288 | ij = fdata[i + 1]; |
| 289 | ar += r; |
| 290 | ai += ij; |
| 291 | rmin = min(r, rmin); |
| 292 | rmax = max(r, rmax); |
| 293 | imin = min(ij, imin); |
| 294 | imax = max(ij, imax); |
| 295 | } |
| 296 | #ifdef DEBUG |
| 297 | printf("Real min %.4g, max %.4g. Imaginary min %.4g, max %.4g.\n", |
| 298 | rmin, rmax, imin, imax); |
| 299 | printf("Average real %.4g, imaginary %.4g.\n", |
| 300 | ar / fanum, ai / fanum); |
| 301 | #endif |
| 302 | mapbase = rmin; |
| 303 | mapscale = 255 / (rmax - rmin); |
| 304 | } |
| 305 | |
| 306 | /* See if we got the right answers. */ |
| 307 | |
| 308 | m = 0; |
| 309 | for (i = 0; i < faedge; i++) { |
| 310 | for (j = 0; j < faedge; j++) { |
| 311 | k = (Re(i, j) - mapbase) * mapscale; |
| 312 | l = (((i & 15) == 8) || ((j & 15) == 8)) ? 255 : 0; |
| 313 | if (k != l) { |
| 314 | m++; |
| 315 | fprintf(stdout, |
| 316 | "Wrong answer at (%d,%d)! Expected %d, got %d.\n", |
| 317 | i, j, l, k); |
| 318 | } |
| 319 | } |
| 320 | } |
| 321 | if (m == 0) { |
| 322 | fprintf(stdout, "%d passes. No errors in results.\n", npasses); |
| 323 | } else { |
| 324 | fprintf(stdout, "%d passes. %d errors in results.\n", |
| 325 | npasses, m); |
| 326 | } |
| 327 | |
| 328 | #ifdef CAPOUT |
| 329 | |
| 330 | /* Output the result of the transform as a CA Lab pattern |
| 331 | file for debugging. */ |
| 332 | |
| 333 | { |
| 334 | #define SCRX 322 |
| 335 | #define SCRY 200 |
| 336 | #define SCRN (SCRX * SCRY) |
| 337 | unsigned char patarr[SCRY][SCRX]; |
| 338 | FILE *fp; |
| 339 | |
| 340 | /* Map user external state numbers to internal state index */ |
| 341 | |
| 342 | #define UtoI(x) (((((x) >> 1) & 0x7F) | ((x) << 7)) & 0xFF) |
| 343 | |
| 344 | /* Copy data from FFT buffer to map. */ |
| 345 | |
| 346 | memset(patarr, 0, sizeof patarr); |
| 347 | l = (SCRX - faedge) / 2; |
| 348 | m = (faedge > SCRY) ? 0 : ((SCRY - faedge) / 2); |
| 349 | for (i = 1; i < faedge; i++) { |
| 350 | for (j = 0; j < min(SCRY, faedge); j++) { |
| 351 | k = (Re(i, j) - mapbase) * mapscale; |
| 352 | patarr[j + m][i + l] = UtoI(k); |
| 353 | } |
| 354 | } |
| 355 | |
| 356 | /* Dump pattern map to file. */ |
| 357 | |
| 358 | fp = fopen("fft.cap", "w"); |
| 359 | if (fp == NULL) { |
| 360 | fprintf(stdout, "Cannot open output file.\n"); |
| 361 | exit(0); |
| 362 | } |
| 363 | putc(':', fp); |
| 364 | putc(1, fp); |
| 365 | fwrite(patarr, SCRN, 1, fp); |
| 366 | putc(6, fp); |
| 367 | fclose(fp); |
| 368 | } |
| 369 | #endif |
| 370 | |
| 371 | return 0; |
| 372 | } |