clock: turn expensive division into multiply + cheap division

On x86-64, dividing by a variable compiles to a hugely expensive
divq. It's much cheaper to invert the division: instead of
dividing clocks by clocks-per-usec, multiply by a precomputed
16M/clocks-per-usec factor and then divide by the constant 16M
(2^24), which the compiler can reduce to a shift.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/gettime.c b/gettime.c
index 035d275..df329f6 100644
--- a/gettime.c
+++ b/gettime.c
@@ -15,6 +15,7 @@
 
 #ifdef ARCH_HAVE_CPU_CLOCK
 static unsigned long cycles_per_usec;
+static unsigned long inv_cycles_per_usec;
 int tsc_reliable = 0;
 #endif
 
@@ -177,7 +178,7 @@
 		} else if (tv)
 			tv->last_cycles = t;
 
-		usecs = t / cycles_per_usec;
+		usecs = (t * inv_cycles_per_usec) / 16777216UL;
 		tp->tv_sec = usecs / 1000000;
 		tp->tv_usec = usecs % 1000000;
 		break;
@@ -277,6 +278,8 @@
 	dprint(FD_TIME, "mean=%f, S=%f\n", mean, S);
 
 	cycles_per_usec = avg;
+	inv_cycles_per_usec = 16777216UL / cycles_per_usec;
+	dprint(FD_TIME, "inv_cycles_per_usec=%lu\n", inv_cycles_per_usec);
 }
 #else
 static void calibrate_cpu_clock(void)