clock: turn expensive division into multiply + cheap division
On x86-64, dividing by a variable compiles to a hugely expensive
divq. It's much cheaper to invert the division: instead of dividing
clocks by clocks-per-usec, multiply by a precomputed
16M/clocks-per-usec constant and then divide by 16M, which is a
constant power of two (2^24) and so reduces to a cheap shift.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/gettime.c b/gettime.c
index 035d275..df329f6 100644
--- a/gettime.c
+++ b/gettime.c
@@ -15,6 +15,7 @@
#ifdef ARCH_HAVE_CPU_CLOCK
static unsigned long cycles_per_usec;
+static unsigned long inv_cycles_per_usec;
int tsc_reliable = 0;
#endif
@@ -177,7 +178,7 @@
} else if (tv)
tv->last_cycles = t;
- usecs = t / cycles_per_usec;
+ usecs = (t * inv_cycles_per_usec) / 16777216UL;
tp->tv_sec = usecs / 1000000;
tp->tv_usec = usecs % 1000000;
break;
@@ -277,6 +278,8 @@
dprint(FD_TIME, "mean=%f, S=%f\n", mean, S);
cycles_per_usec = avg;
+ inv_cycles_per_usec = 16777216UL / cycles_per_usec;
+ dprint(FD_TIME, "inv_cycles_per_usec=%lu\n", inv_cycles_per_usec);
}
#else
static void calibrate_cpu_clock(void)