Refactor microbenchmarks

- Report CPU frequency only if it is successfully queried, under the
  "cpufreq" counter (previously "Freq"); see the sketch below
- Rename the "BYTES" counter to "bytes" for consistency
- Rename the local variable n to elements in element-wise benchmarks
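
The same guarded pattern is applied across all benchmarks below: query the
frequency once, and register the "cpufreq" counter only when the query
succeeds (benchmark::utils::GetCurrentCpuFrequency() returns 0 when the
frequency cannot be determined). A minimal sketch of the pattern, assuming
a Google Benchmark function body and the XNNPACK bench/utils.h helpers;
the ExampleBenchmark name is hypothetical, for illustration only:

    #include <cstdint>

    #include <benchmark/benchmark.h>
    #include "bench/utils.h"

    // Hypothetical benchmark illustrating the guarded cpufreq counter.
    static void ExampleBenchmark(benchmark::State& state) {
      for (auto _ : state) {
        // ... workload under measurement ...
      }

      // Report the CPU frequency only if it was successfully queried.
      const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
      if (cpu_frequency != 0) {
        state.counters["cpufreq"] = cpu_frequency;
      }
    }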

PiperOrigin-RevId: 345756568
diff --git a/bench/average-pooling.cc b/bench/average-pooling.cc
index a228900..d4720bd 100644
--- a/bench/average-pooling.cc
+++ b/bench/average-pooling.cc
@@ -95,7 +95,10 @@
   }
   pooling_op = nullptr;
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   state.counters["bytes"] = benchmark::Counter(
     uint64_t(state.iterations()) *
@@ -169,7 +172,10 @@
   }
   pooling_op = nullptr;
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   state.counters["bytes"] = benchmark::Counter(
     uint64_t(state.iterations()) *
@@ -304,7 +310,10 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   state.counters["bytes"] = benchmark::Counter(
     uint64_t(state.iterations()) *
diff --git a/bench/bankers-rounding.cc b/bench/bankers-rounding.cc
index 9925c27..09415cf 100644
--- a/bench/bankers-rounding.cc
+++ b/bench/bankers-rounding.cc
@@ -77,7 +77,10 @@
     return;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
@@ -184,7 +187,10 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
diff --git a/bench/ceiling.cc b/bench/ceiling.cc
index 6cff9b1..6c2e164 100644
--- a/bench/ceiling.cc
+++ b/bench/ceiling.cc
@@ -77,7 +77,10 @@
     return;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
@@ -184,7 +187,10 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
diff --git a/bench/channel-shuffle.cc b/bench/channel-shuffle.cc
index 97d5ab0..b45df59 100644
--- a/bench/channel-shuffle.cc
+++ b/bench/channel-shuffle.cc
@@ -73,7 +73,10 @@
     return;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * groups * group_channels;
   state.counters["elements"] =
@@ -138,7 +141,10 @@
     return;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * groups * group_channels;
   state.counters["elements"] =
diff --git a/bench/convolution.cc b/bench/convolution.cc
index b0629d7..4616f5f 100644
--- a/bench/convolution.cc
+++ b/bench/convolution.cc
@@ -143,7 +143,11 @@
     convolution_op = nullptr;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["OPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       batch_size * output_height * output_width *
@@ -259,7 +263,11 @@
     convolution_op = nullptr;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["OPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       batch_size * output_height * output_width *
@@ -375,7 +383,11 @@
     convolution_op = nullptr;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       batch_size * output_height * output_width *
@@ -486,7 +498,11 @@
     convolution_op = nullptr;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       batch_size * output_height * output_width *
@@ -693,7 +709,11 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       batch_size * output_height * output_width *
@@ -1016,7 +1036,11 @@
   bias_tensor.allocator()->free();
   output_tensor.allocator()->free();
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       batch_size * output_height * output_width *
diff --git a/bench/deconvolution.cc b/bench/deconvolution.cc
index 7ac9623..3d01df6 100644
--- a/bench/deconvolution.cc
+++ b/bench/deconvolution.cc
@@ -130,13 +130,17 @@
     deconvolution_op = nullptr;
   }
 
-    state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
-    state.counters["OPS"] = benchmark::Counter(
-    uint64_t(state.iterations()) * 2 *
-      batch_size * input_width * input_width *
-      groups * group_input_channels * group_output_channels *
-      kernel_height * kernel_width,
-    benchmark::Counter::kIsRate);
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
+  state.counters["OPS"] = benchmark::Counter(
+  uint64_t(state.iterations()) * 2 *
+    batch_size * input_width * input_width *
+    groups * group_input_channels * group_output_channels *
+    kernel_height * kernel_width,
+  benchmark::Counter::kIsRate);
 }
 #endif  // XNN_NO_QU8_OPERATORS
 
@@ -243,7 +247,11 @@
     deconvolution_op = nullptr;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       batch_size * input_width * input_width *
@@ -427,7 +435,11 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       batch_size * input_width * input_width *
diff --git a/bench/end2end.cc b/bench/end2end.cc
index 5052bda..52c5436 100644
--- a/bench/end2end.cc
+++ b/bench/end2end.cc
@@ -45,7 +45,11 @@
       }
     }
   }
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 }
 
 static void FP32MobileNetV1(benchmark::State& state) {
diff --git a/bench/f16-dwconv.cc b/bench/f16-dwconv.cc
index 836da0b..e8649d5 100644
--- a/bench/f16-dwconv.cc
+++ b/bench/f16-dwconv.cc
@@ -143,12 +143,15 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
-  state.counters["FLOPS"] = benchmark::Counter(
-    uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
-    benchmark::Counter::kIsRate);
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
-  state.counters["BYTES"] = benchmark::Counter(
+  state.counters["FLOPS"] = benchmark::Counter(
+    uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size, benchmark::Counter::kIsRate);
+
+  state.counters["bytes"] = benchmark::Counter(
     uint64_t(state.iterations()) * (output_size + input_height * input_width + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
     benchmark::Counter::kIsRate);
 }
@@ -156,62 +159,62 @@
 #if XNN_ARCH_ARM64
   static void f16_dwconv_8x25__neonfp16arith_acc2(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2, 8, 25,
-    benchmark::utils::CheckNEONFP16ARITH);
+      benchmark::utils::CheckNEONFP16ARITH);
   }
 
   static void f16_dwconv_8x25__neonfp16arith(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith, 8, 25,
-    benchmark::utils::CheckNEONFP16ARITH);
+      benchmark::utils::CheckNEONFP16ARITH);
   }
 
   static void f16_dwconv_8x4__neonfp16arith_acc2(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2, 8, 4,
-    benchmark::utils::CheckNEONFP16ARITH);
+      benchmark::utils::CheckNEONFP16ARITH);
   }
 
   static void f16_dwconv_8x4__neonfp16arith(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith, 8, 4,
-    benchmark::utils::CheckNEONFP16ARITH);
+      benchmark::utils::CheckNEONFP16ARITH);
   }
 
   static void f16_dwconv_8x9__neonfp16arith_acc2(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2, 8, 9,
-    benchmark::utils::CheckNEONFP16ARITH);
+      benchmark::utils::CheckNEONFP16ARITH);
   }
 
   static void f16_dwconv_8x9__neonfp16arith(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith, 8, 9,
-    benchmark::utils::CheckNEONFP16ARITH);
+      benchmark::utils::CheckNEONFP16ARITH);
   }
 
   static void f16_dwconv_16x25__neonfp16arith_acc2(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2, 16, 25,
-    benchmark::utils::CheckNEONFP16ARITH);
+      benchmark::utils::CheckNEONFP16ARITH);
   }
 
   static void f16_dwconv_16x25__neonfp16arith(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith, 16, 25,
-    benchmark::utils::CheckNEONFP16ARITH);
+      benchmark::utils::CheckNEONFP16ARITH);
   }
 
   static void f16_dwconv_16x4__neonfp16arith_acc2(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2, 16, 4,
-    benchmark::utils::CheckNEONFP16ARITH);
+      benchmark::utils::CheckNEONFP16ARITH);
   }
 
   static void f16_dwconv_16x4__neonfp16arith(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith, 16, 4,
-    benchmark::utils::CheckNEONFP16ARITH);
+      benchmark::utils::CheckNEONFP16ARITH);
   }
 
   static void f16_dwconv_16x9__neonfp16arith_acc2(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2, 16, 9,
-    benchmark::utils::CheckNEONFP16ARITH);
+      benchmark::utils::CheckNEONFP16ARITH);
   }
 
   static void f16_dwconv_16x9__neonfp16arith(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith, 16, 9,
-    benchmark::utils::CheckNEONFP16ARITH);
+      benchmark::utils::CheckNEONFP16ARITH);
   }
 
   BENCHMARK_DWCONV(f16_dwconv_8x25__neonfp16arith_acc2)
diff --git a/bench/f16-gemm.cc b/bench/f16-gemm.cc
index 738883d..07f0f1d 100644
--- a/bench/f16-gemm.cc
+++ b/bench/f16-gemm.cc
@@ -100,7 +100,11 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
 }
diff --git a/bench/f16-igemm.cc b/bench/f16-igemm.cc
index e0ea555..51a42ab 100644
--- a/bench/f16-igemm.cc
+++ b/bench/f16-igemm.cc
@@ -150,7 +150,11 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       output_height * output_width *
diff --git a/bench/f16-relu.cc b/bench/f16-relu.cc
index 404d57c..8a14e7b 100644
--- a/bench/f16-relu.cc
+++ b/bench/f16-relu.cc
@@ -25,37 +25,38 @@
   xnn_f16_relu_ukernel_function f16_relu,
   benchmark::utils::IsaCheckFunction isa_check = nullptr)
 {
-  if (!cpuinfo_initialize()) {
-    state.SkipWithError("cpuinfo initialization failed");
-    return;
-  }
   if (isa_check && !isa_check(state)) {
     return;
   }
 
-  const size_t n = state.range(0);
+  const size_t elements = state.range(0);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
 
-  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(n);
+  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(elements);
   std::generate(x.begin(), x.end(), std::ref(f16rng));
-  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(n);
+  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(elements);
   std::generate(x.begin(), x.end(), std::ref(f16rng));
 
   for (auto _ : state) {
-    f16_relu(n * sizeof(uint16_t), x.data(), y.data(), NULL);
+    f16_relu(elements * sizeof(uint16_t), x.data(), y.data(), NULL);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
+  const size_t elements_per_iteration = elements;
   state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
 
+  const size_t bytes_per_iteration = 2 * elements * sizeof(uint16_t);
   state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n * sizeof(uint16_t), benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 
 #if XNN_ARCH_ARM64
diff --git a/bench/f16-spmm.cc b/bench/f16-spmm.cc
index a19f736..982e26d 100644
--- a/bench/f16-spmm.cc
+++ b/bench/f16-spmm.cc
@@ -155,7 +155,11 @@
       &params);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * num_nonzeroes, benchmark::Counter::kIsRate);
 
diff --git a/bench/f32-conv-hwc.cc b/bench/f32-conv-hwc.cc
index df2f4e8..2b5cee6 100644
--- a/bench/f32-conv-hwc.cc
+++ b/bench/f32-conv-hwc.cc
@@ -100,7 +100,11 @@
       &params);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       output_height * output_width *
diff --git a/bench/f32-conv-hwc2chw.cc b/bench/f32-conv-hwc2chw.cc
index 301dc91..35fe576 100644
--- a/bench/f32-conv-hwc2chw.cc
+++ b/bench/f32-conv-hwc2chw.cc
@@ -100,7 +100,11 @@
       &params);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       output_height * output_width *
diff --git a/bench/f32-dwconv-e2e.cc b/bench/f32-dwconv-e2e.cc
index 367f908..2cb171f 100644
--- a/bench/f32-dwconv-e2e.cc
+++ b/bench/f32-dwconv-e2e.cc
@@ -63,7 +63,11 @@
       }
     }
   }
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 }
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
diff --git a/bench/f32-dwconv.cc b/bench/f32-dwconv.cc
index 648cf34..990513d 100644
--- a/bench/f32-dwconv.cc
+++ b/bench/f32-dwconv.cc
@@ -141,12 +141,16 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
     benchmark::Counter::kIsRate);
 
-  state.counters["BYTES"] = benchmark::Counter(
+  state.counters["bytes"] = benchmark::Counter(
     uint64_t(state.iterations()) * (output_size + input_height * input_width + kernel_size + 1 /* bias */) * channels * sizeof(float),
     benchmark::Counter::kIsRate);
 }
diff --git a/bench/f32-dwconv2d-chw.cc b/bench/f32-dwconv2d-chw.cc
index 51f23e7..1160429 100644
--- a/bench/f32-dwconv2d-chw.cc
+++ b/bench/f32-dwconv2d-chw.cc
@@ -137,12 +137,16 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
     benchmark::Counter::kIsRate);
 
-  state.counters["BYTES"] = benchmark::Counter(
+  state.counters["bytes"] = benchmark::Counter(
     uint64_t(state.iterations()) * (output_size + inputSize + kernel_size + 1 /* bias */) * channels * sizeof(float),
     benchmark::Counter::kIsRate);
 }
diff --git a/bench/f32-gemm-e2e.cc b/bench/f32-gemm-e2e.cc
index 523ac34..a5e2020 100644
--- a/bench/f32-gemm-e2e.cc
+++ b/bench/f32-gemm-e2e.cc
@@ -65,7 +65,11 @@
       }
     }
   }
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 }
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index 9ccbe4b..dbe3eb7 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -101,7 +101,11 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
 }
@@ -177,7 +181,11 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
 }
@@ -257,7 +265,11 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
 }
@@ -335,7 +347,11 @@
     ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
 }
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index 2a961fa..a14cb43 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -145,7 +145,11 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       output_height * output_width *
diff --git a/bench/f32-im2col-gemm.cc b/bench/f32-im2col-gemm.cc
index 4843925..e5b3354 100644
--- a/bench/f32-im2col-gemm.cc
+++ b/bench/f32-im2col-gemm.cc
@@ -124,7 +124,11 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 *
       output_height * output_width *
diff --git a/bench/f32-raddexpminusmax.cc b/bench/f32-raddexpminusmax.cc
index f7ef890..50d4633 100644
--- a/bench/f32-raddexpminusmax.cc
+++ b/bench/f32-raddexpminusmax.cc
@@ -28,17 +28,17 @@
     return;
   }
 
-  const size_t n = state.range(0);
+  const size_t elements = state.range(0);
   const size_t cache_line_size_max = 128;
-  const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
+  const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
 
   const size_t num_buffers = 1 +
-    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
-  std::vector<float, AlignedAllocator<float, 64>> x(n);
+    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
+  std::vector<float, AlignedAllocator<float, 64>> x(elements);
 
   std::generate(x.begin(), x.end(), std::ref(f32rng));
 
@@ -48,21 +48,28 @@
   for (auto _ : state) {
     state.PauseTiming();
     float x_max = nanf("");
-    rmax(n * sizeof(float), x.data(), &x_max);
+    rmax(elements * sizeof(float), x.data(), &x_max);
     if (++buffer_index == num_buffers) {
       buffer_index = 0;
     }
     state.ResumeTiming();
 
     float y_sum = nanf("");
-    raddexpminusmax(n * sizeof(float), x.data(), &y_sum, x_max);
+    raddexpminusmax(elements * sizeof(float), x.data(), &y_sum, x_max);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
+  const size_t elements_per_iteration = elements;
   state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+  const size_t bytes_per_iteration = 2 * elements * sizeof(float);
   state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * sizeof(float) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 
 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
diff --git a/bench/f32-raddextexp.cc b/bench/f32-raddextexp.cc
index 85965e0..6196e01 100644
--- a/bench/f32-raddextexp.cc
+++ b/bench/f32-raddextexp.cc
@@ -26,17 +26,17 @@
     return;
   }
 
-  const size_t n = state.range(0);
+  const size_t elements = state.range(0);
   const size_t cache_line_size_max = 128;
-  const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
+  const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
 
   const size_t num_buffers = 1 +
-    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
-  std::vector<float, AlignedAllocator<float, 64>> x(n);
+    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
+  std::vector<float, AlignedAllocator<float, 64>> x(elements);
 
   std::generate(x.begin(), x.end(), std::ref(f32rng));
 
@@ -49,14 +49,21 @@
     }
 
     float y_sum[2] = { nanf(""), nanf("") };
-    raddextexp(n * sizeof(float), x.data(), y_sum);
+    raddextexp(elements * sizeof(float), x.data(), y_sum);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
+  const size_t elements_per_iteration = elements;
   state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+  const size_t bytes_per_iteration = 2 * elements * sizeof(float);
   state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * sizeof(float) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 
 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
diff --git a/bench/f32-raddstoreexpminusmax.cc b/bench/f32-raddstoreexpminusmax.cc
index 96da46f..c5fe76d 100644
--- a/bench/f32-raddstoreexpminusmax.cc
+++ b/bench/f32-raddstoreexpminusmax.cc
@@ -28,18 +28,18 @@
     return;
   }
 
-  const size_t n = state.range(0);
+  const size_t elements = state.range(0);
   const size_t cache_line_size_max = 128;
-  const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
+  const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
 
   const size_t num_buffers = 1 +
-    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
-  std::vector<float, AlignedAllocator<float, 64>> x(n);
-  std::vector<float, AlignedAllocator<float, 64>> y(packed_n * num_buffers);
+    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
+  std::vector<float, AlignedAllocator<float, 64>> x(elements);
+  std::vector<float, AlignedAllocator<float, 64>> y(packed_elements * num_buffers);
 
   std::generate(x.begin(), x.end(), std::ref(f32rng));
 
@@ -49,21 +49,28 @@
   for (auto _ : state) {
     state.PauseTiming();
     float x_max = nanf("");
-    rmax(n * sizeof(float), x.data(), &x_max);
+    rmax(elements * sizeof(float), x.data(), &x_max);
     if (++buffer_index == num_buffers) {
       buffer_index = 0;
     }
     state.ResumeTiming();
 
     float y_sum = nanf("");
-    raddstoreexpminusmax(n * sizeof(float), x.data(), y.data() + buffer_index * packed_n, &y_sum, x_max);
+    raddstoreexpminusmax(elements * sizeof(float), x.data(), y.data() + buffer_index * packed_elements, &y_sum, x_max);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
+  const size_t elements_per_iteration = elements;
   state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+  const size_t bytes_per_iteration = 2 * elements * sizeof(float);
   state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 
 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
diff --git a/bench/f32-relu.cc b/bench/f32-relu.cc
index 7e30296..a0d14f1 100644
--- a/bench/f32-relu.cc
+++ b/bench/f32-relu.cc
@@ -21,28 +21,33 @@
   benchmark::State& state,
   xnn_f32_relu_ukernel_function f32_relu)
 {
-  const size_t n = state.range(0);
+  const size_t elements = state.range(0);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
 
-  std::vector<float, AlignedAllocator<float, 64>> x(n);
+  std::vector<float, AlignedAllocator<float, 64>> x(elements);
   std::generate(x.begin(), x.end(), std::ref(f32rng));
-  std::vector<float, AlignedAllocator<float, 64>> y(n);
+  std::vector<float, AlignedAllocator<float, 64>> y(elements);
   std::generate(x.begin(), x.end(), std::ref(f32rng));
 
   for (auto _ : state) {
-    f32_relu(n * sizeof(float), x.data(), y.data(), NULL);
+    f32_relu(elements * sizeof(float), x.data(), y.data(), NULL);
   }
 
-    state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
-    state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
+  const size_t elements_per_iteration = elements;
+  state.counters["elements"] =
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
 
+  const size_t bytes_per_iteration = 2 * elements * sizeof(float);
   state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n * sizeof(float), benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/bench/f32-rmax.cc b/bench/f32-rmax.cc
index 521c15c..a9b88aa 100644
--- a/bench/f32-rmax.cc
+++ b/bench/f32-rmax.cc
@@ -21,27 +21,32 @@
   benchmark::State& state,
   xnn_f32_rmax_ukernel_function f32_rmax)
 {
-  const size_t n = state.range(0);
+  const size_t elements = state.range(0);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
 
-  std::vector<float, AlignedAllocator<float, 64>> x(n);
+  std::vector<float, AlignedAllocator<float, 64>> x(elements);
   std::generate(x.begin(), x.end(), std::ref(f32rng));
 
   float y;
   for (auto _ : state) {
-    f32_rmax(n * sizeof(float), x.data(), &y);
+    f32_rmax(elements * sizeof(float), x.data(), &y);
   }
 
-    state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
-    state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
+  const size_t elements_per_iteration = elements;
+  state.counters["elements"] =
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
 
+  const size_t bytes_per_iteration = elements * sizeof(float);
   state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n * sizeof(float), benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/bench/f32-sigmoid.cc b/bench/f32-sigmoid.cc
index 86c7e84..631893d 100644
--- a/bench/f32-sigmoid.cc
+++ b/bench/f32-sigmoid.cc
@@ -36,7 +36,10 @@
     sigmoid(elements * sizeof(float), x.data(), y.data(), nullptr /* params */);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = elements;
   state.counters["elements"] =
diff --git a/bench/f32-softmax.cc b/bench/f32-softmax.cc
index 6f6ed94..ecc4d10 100644
--- a/bench/f32-softmax.cc
+++ b/bench/f32-softmax.cc
@@ -27,18 +27,18 @@
 static void DNNLSoftArgMax(
   benchmark::State& state)
 {
-  const size_t n = state.range(0);
+  const size_t elements = state.range(0);
   const size_t cache_line_size_max = 128;
-  const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
+  const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
 
   const size_t num_buffers = 1 +
-    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
-  std::vector<float> x(n);
-  std::vector<float> y(packed_n * num_buffers);
+    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
+  std::vector<float> x(elements);
+  std::vector<float> y(packed_elements * num_buffers);
 
   std::generate(x.begin(), x.end(), std::ref(f32rng));
 
@@ -48,7 +48,7 @@
     return;
   }
 
-  dnnl_dim_t input_output_shape[1] = { static_cast<int>(n) };
+  dnnl_dim_t input_output_shape[1] = { static_cast<int>(elements) };
 
   dnnl_memory_desc_t memory_descriptor = { 0 };
   if (dnnl_memory_desc_init_by_tag(
@@ -162,11 +162,18 @@
     return;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
+  const size_t elements_per_iteration = elements;
   state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+  const size_t bytes_per_iteration = 2 * elements * sizeof(float);
   state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 #endif  // BENCHMARK_INTEL_DNNL
 
@@ -181,18 +188,18 @@
     return;
   }
 
-  const size_t n = state.range(0);
+  const size_t elements = state.range(0);
   const size_t cache_line_size_max = 128;
-  const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
+  const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
 
   const size_t num_buffers = 1 +
-    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
-  std::vector<float> x(n);
-  std::vector<float> y(packed_n * num_buffers);
+    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
+  std::vector<float> x(elements);
+  std::vector<float> y(packed_elements * num_buffers);
 
   std::generate(x.begin(), x.end(), std::ref(f32rng));
 
@@ -207,10 +214,10 @@
 
     const auto start = std::chrono::high_resolution_clock::now();
     float x_max = nanf("");
-    rmax(n * sizeof(float), x.data(), &x_max);
+    rmax(elements * sizeof(float), x.data(), &x_max);
     float y_sum = nanf("");
-    raddexpminusmax(n * sizeof(float), x.data(), &y_sum, x_max);
-    vscaleexpminusmax(n * sizeof(float), x.data(), y.data() + packed_n * buffer_index, x_max, 1.0f / y_sum);
+    raddexpminusmax(elements * sizeof(float), x.data(), &y_sum, x_max);
+    vscaleexpminusmax(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, x_max, 1.0f / y_sum);
     const auto end = std::chrono::high_resolution_clock::now();
 
     const auto elapsed_seconds =
@@ -218,11 +225,18 @@
     state.SetIterationTime(elapsed_seconds.count());
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
+  const size_t elements_per_iteration = elements;
   state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+  const size_t bytes_per_iteration = 2 * elements * sizeof(float);
   state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 
 static void ThreePassSoftMaxWithReloading(
@@ -236,18 +250,18 @@
     return;
   }
 
-  const size_t n = state.range(0);
+  const size_t elements = state.range(0);
   const size_t cache_line_size_max = 128;
-  const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
+  const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
 
   const size_t num_buffers = 1 +
-    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
-  std::vector<float> x(n);
-  std::vector<float> y(packed_n * num_buffers);
+    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
+  std::vector<float> x(elements);
+  std::vector<float> y(packed_elements * num_buffers);
 
   std::generate(x.begin(), x.end(), std::ref(f32rng));
 
@@ -262,10 +276,10 @@
 
     const auto start = std::chrono::high_resolution_clock::now();
     float x_max = nanf("");
-    rmax(n * sizeof(float), x.data(), &x_max);
+    rmax(elements * sizeof(float), x.data(), &x_max);
     float y_sum = nanf("");
-    raddstoreexpminusmax(n * sizeof(float), x.data(), y.data() + packed_n * buffer_index, &y_sum, x_max);
-    vscale(n * sizeof(float), y.data() + packed_n * buffer_index, y.data() + packed_n * buffer_index, 1.0f / y_sum);
+    raddstoreexpminusmax(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, &y_sum, x_max);
+    vscale(elements * sizeof(float), y.data() + packed_elements * buffer_index, y.data() + packed_elements * buffer_index, 1.0f / y_sum);
     const auto end = std::chrono::high_resolution_clock::now();
 
     const auto elapsed_seconds =
@@ -273,11 +287,18 @@
     state.SetIterationTime(elapsed_seconds.count());
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
+  const size_t elements_per_iteration = elements;
   state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+  const size_t bytes_per_iteration = 2 * elements * sizeof(float);
   state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 
 static void TwoPassSoftMax(
@@ -290,18 +311,18 @@
     return;
   }
 
-  const size_t n = state.range(0);
+  const size_t elements = state.range(0);
   const size_t cache_line_size_max = 128;
-  const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
+  const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
 
   const size_t num_buffers = 1 +
-    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
-  std::vector<float> x(n);
-  std::vector<float> y(packed_n * num_buffers);
+    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
+  std::vector<float> x(elements);
+  std::vector<float> y(packed_elements * num_buffers);
 
   std::generate(x.begin(), x.end(), std::ref(f32rng));
 
@@ -316,8 +337,8 @@
 
     const auto start = std::chrono::high_resolution_clock::now();
     float scale[2];
-    raddextexp(n * sizeof(float), x.data(), scale);
-    vscaleextexp(n * sizeof(float), x.data(), y.data() + packed_n * buffer_index, 1.0f / scale[0], -scale[1]);
+    raddextexp(elements * sizeof(float), x.data(), scale);
+    vscaleextexp(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, 1.0f / scale[0], -scale[1]);
     const auto end = std::chrono::high_resolution_clock::now();
 
     const auto elapsed_seconds =
@@ -325,11 +346,18 @@
     state.SetIterationTime(elapsed_seconds.count());
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
+  const size_t elements_per_iteration = elements;
   state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+  const size_t bytes_per_iteration = 2 * elements * sizeof(float);
   state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 
 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
diff --git a/bench/f32-spmm.cc b/bench/f32-spmm.cc
index c720072..4ffedef 100644
--- a/bench/f32-spmm.cc
+++ b/bench/f32-spmm.cc
@@ -150,7 +150,11 @@
       &params);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["FLOPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * num_nonzeroes, benchmark::Counter::kIsRate);
 
diff --git a/bench/f32-vscaleexpminusmax.cc b/bench/f32-vscaleexpminusmax.cc
index 8f77d60..568a982 100644
--- a/bench/f32-vscaleexpminusmax.cc
+++ b/bench/f32-vscaleexpminusmax.cc
@@ -28,18 +28,18 @@
     return;
   }
 
-  const size_t n = state.range(0);
+  const size_t elements = state.range(0);
   const size_t cache_line_size_max = 128;
-  const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
+  const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
 
   const size_t num_buffers = 1 +
-    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
-  std::vector<float, AlignedAllocator<float, 64>> x(n);
-  std::vector<float, AlignedAllocator<float, 64>> y(packed_n * num_buffers);
+    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
+  std::vector<float, AlignedAllocator<float, 64>> x(elements);
+  std::vector<float, AlignedAllocator<float, 64>> y(packed_elements * num_buffers);
 
   std::generate(x.begin(), x.end(), std::ref(f32rng));
 
@@ -49,22 +49,29 @@
   for (auto _ : state) {
     state.PauseTiming();
     float x_max = nanf("");
-    rmax(n * sizeof(float), x.data(), &x_max);
+    rmax(elements * sizeof(float), x.data(), &x_max);
     float y_sum = nanf("");
-    raddexpminusmax(n * sizeof(float), x.data(), &y_sum, x_max);
+    raddexpminusmax(elements * sizeof(float), x.data(), &y_sum, x_max);
     if (++buffer_index == num_buffers) {
       buffer_index = 0;
     }
     state.ResumeTiming();
 
-    vscaleexpminusmax(n * sizeof(float), x.data(), y.data() + packed_n * buffer_index, x_max, 1.0f / y_sum);
+    vscaleexpminusmax(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, x_max, 1.0f / y_sum);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
+  const size_t elements_per_iteration = elements;
   state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+  const size_t bytes_per_iteration = 2 * elements * sizeof(float);
   state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 
 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
diff --git a/bench/f32-vscaleextexp.cc b/bench/f32-vscaleextexp.cc
index e955e3a..23aa20f 100644
--- a/bench/f32-vscaleextexp.cc
+++ b/bench/f32-vscaleextexp.cc
@@ -26,9 +26,9 @@
     return;
   }
 
-  const size_t n = state.range(0);
+  const size_t elements = state.range(0);
   const size_t cache_line_size_max = 128;
-  const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
+  const size_t packed_n = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -36,7 +36,7 @@
 
   const size_t num_buffers = 1 +
     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
-  std::vector<float, AlignedAllocator<float, 64>> x(n);
+  std::vector<float, AlignedAllocator<float, 64>> x(elements);
   std::vector<float, AlignedAllocator<float, 64>> y(packed_n * num_buffers);
 
   std::generate(x.begin(), x.end(), std::ref(f32rng));
@@ -47,7 +47,7 @@
   for (auto _ : state) {
     state.PauseTiming();
     float scale[2];
-    raddextexp(n * sizeof(float), x.data(), scale);
+    raddextexp(elements * sizeof(float), x.data(), scale);
     const float ext_mantissa = 1.0f / scale[0];
     const float ext_exponent = -scale[1];
     if (++buffer_index == num_buffers) {
@@ -55,14 +55,21 @@
     }
     state.ResumeTiming();
 
-    vscaleextexp(n * sizeof(float), x.data(), y.data() + packed_n * buffer_index, ext_mantissa, ext_exponent);
+    vscaleextexp(elements * sizeof(float), x.data(), y.data() + packed_n * buffer_index, ext_mantissa, ext_exponent);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
+  const size_t elements_per_iteration = elements;
   state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+  const size_t bytes_per_iteration = 2 * elements * sizeof(float);
   state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate);
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 
 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
diff --git a/bench/floor.cc b/bench/floor.cc
index da46368..08fd7e9 100644
--- a/bench/floor.cc
+++ b/bench/floor.cc
@@ -77,7 +77,10 @@
     return;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
@@ -184,7 +187,10 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
diff --git a/bench/global-average-pooling.cc b/bench/global-average-pooling.cc
index 517c679..e6dbebd 100644
--- a/bench/global-average-pooling.cc
+++ b/bench/global-average-pooling.cc
@@ -70,7 +70,11 @@
   }
   global_pooling_op = nullptr;
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["bytes"] = benchmark::Counter(
     uint64_t(state.iterations()) *
       batch_size * (input_height * input_width + 1) * channels * sizeof(uint8_t),
@@ -129,7 +133,11 @@
   }
   global_pooling_op = nullptr;
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["bytes"] = benchmark::Counter(
     uint64_t(state.iterations()) *
       batch_size * (input_height * input_width + 1) * channels * sizeof(int8_t),
@@ -189,7 +197,11 @@
   }
   global_pooling_op = nullptr;
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["bytes"] = benchmark::Counter(
     uint64_t(state.iterations()) *
       batch_size * (input_height * input_width + 1) * channels * sizeof(uint16_t),
@@ -244,7 +256,11 @@
   }
   global_pooling_op = nullptr;
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["bytes"] = benchmark::Counter(
     uint64_t(state.iterations()) *
       batch_size * (input_height * input_width + 1) * channels * sizeof(float),
diff --git a/bench/hardswish.cc b/bench/hardswish.cc
index 7eb91c4..385976d 100644
--- a/bench/hardswish.cc
+++ b/bench/hardswish.cc
@@ -79,7 +79,10 @@
     return;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
@@ -186,7 +189,10 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
@@ -254,7 +260,10 @@
     return;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
diff --git a/bench/max-pooling.cc b/bench/max-pooling.cc
index ea52622..d69c1de 100644
--- a/bench/max-pooling.cc
+++ b/bench/max-pooling.cc
@@ -86,7 +86,11 @@
   }
   pooling_op = nullptr;
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["bytes"] = benchmark::Counter(
     uint64_t(state.iterations()) *
       batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(uint8_t),
@@ -159,7 +163,11 @@
   }
   pooling_op = nullptr;
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["bytes"] = benchmark::Counter(
     uint64_t(state.iterations()) *
       batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(float),
diff --git a/bench/prelu.cc b/bench/prelu.cc
index 7bed219..5c0700e 100644
--- a/bench/prelu.cc
+++ b/bench/prelu.cc
@@ -82,7 +82,10 @@
   }
   prelu_op = nullptr;
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * height * width * channels;
   state.counters["elements"] =
@@ -207,7 +210,10 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * height * width * channels;
   state.counters["elements"] =
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 9b04607..f50452c 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -100,7 +100,11 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["OPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
 }
@@ -177,7 +181,11 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["OPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
 }
@@ -260,7 +268,11 @@
     ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["OPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
 }
diff --git a/bench/qu8-gemm.cc b/bench/qu8-gemm.cc
index 8934f6d..a64338a 100644
--- a/bench/qu8-gemm.cc
+++ b/bench/qu8-gemm.cc
@@ -104,7 +104,11 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["OPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
 }
@@ -194,7 +198,11 @@
         &threadingContext, AM, BM, &CM, 127, 127, outputPipeline);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["OPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
 }
@@ -284,7 +292,11 @@
     ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
   state.counters["OPS"] = benchmark::Counter(
     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
 }
diff --git a/bench/softmax.cc b/bench/softmax.cc
index cfe1430..d77e860 100644
--- a/bench/softmax.cc
+++ b/bench/softmax.cc
@@ -78,7 +78,10 @@
     return;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
@@ -141,7 +144,10 @@
     return;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
@@ -255,7 +261,10 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
diff --git a/bench/square-root.cc b/bench/square-root.cc
index 693dcb2..3c1f10b 100644
--- a/bench/square-root.cc
+++ b/bench/square-root.cc
@@ -77,7 +77,10 @@
     return;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
@@ -184,7 +187,10 @@
     }
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =
diff --git a/bench/truncation.cc b/bench/truncation.cc
index b87dc05..8d63645 100644
--- a/bench/truncation.cc
+++ b/bench/truncation.cc
@@ -68,7 +68,10 @@
     return;
   }
 
-  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
 
   const size_t elements_per_iteration = batch_size * channels;
   state.counters["elements"] =