Add checks for target ISA in microbenchmarks
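Avoid running micro-kernels that are incompatible with the host processor:
each benchmark driver now accepts an optional ISA check (e.g. CheckNEON,
CheckNEONFMA, CheckAVX, CheckFMA3) and returns early when the check fails.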
PiperOrigin-RevId: 282471857
diff --git a/bench/f32-dwconv-e2e.cc b/bench/f32-dwconv-e2e.cc
index f0cce9d..b4a58db 100644
--- a/bench/f32-dwconv-e2e.cc
+++ b/bench/f32-dwconv-e2e.cc
@@ -23,8 +23,12 @@
benchmark::State& state,
models::ExecutionPlanFactory model_factory,
xnn_f32_dwconv_up_ukernel_function dwconv,
- uint8_t cr, uint8_t mr)
+ uint8_t cr, uint8_t mr,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
state.SkipWithError("failed to initialize XNNPACK");
return;
@@ -85,49 +89,49 @@
static void f32_dwconv_up4x9__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up4x9__neon,
- 4 /* cr */, 9 /* mr */);
+ 4 /* cr */, 9 /* mr */, benchmark::utils::CheckNEON);
}
static void f32_dwconv_up4x9__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up4x9__neon_acc2,
- 4 /* cr */, 9 /* mr */);
+ 4 /* cr */, 9 /* mr */, benchmark::utils::CheckNEON);
}
static void f32_dwconv_up8x9__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up8x9__neon,
- 8 /* cr */, 9 /* mr */);
+ 8 /* cr */, 9 /* mr */, benchmark::utils::CheckNEON);
}
static void f32_dwconv_up8x9__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up8x9__neon_acc2,
- 8 /* cr */, 9 /* mr */);
+ 8 /* cr */, 9 /* mr */, benchmark::utils::CheckNEON);
}
static void f32_dwconv_up4x9__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up4x9__neonfma,
- 4 /* cr */, 9 /* mr */);
+ 4 /* cr */, 9 /* mr */, benchmark::utils::CheckNEONFMA);
}
static void f32_dwconv_up4x9__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2,
- 4 /* cr */, 9 /* mr */);
+ 4 /* cr */, 9 /* mr */, benchmark::utils::CheckNEONFMA);
}
static void f32_dwconv_up8x9__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up8x9__neonfma,
- 8 /* cr */, 9 /* mr */);
+ 8 /* cr */, 9 /* mr */, benchmark::utils::CheckNEONFMA);
}
static void f32_dwconv_up8x9__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2,
- 8 /* cr */, 9 /* mr */);
+ 8 /* cr */, 9 /* mr */, benchmark::utils::CheckNEONFMA);
}
BENCHMARK_CAPTURE(f32_dwconv_up4x9__neon, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
@@ -184,49 +188,49 @@
static void f32_dwconv_up8x9__avx(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up8x9__avx,
- 8 /* cr */, 9 /* mr */);
+ 8 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX);
}
static void f32_dwconv_up8x9__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up8x9__avx_acc2,
- 8 /* cr */, 9 /* mr */);
+ 8 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX);
}
static void f32_dwconv_up16x9__avx(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up16x9__avx,
- 16 /* cr */, 9 /* mr */);
+ 16 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX);
}
static void f32_dwconv_up16x9__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up16x9__avx_acc2,
- 16 /* cr */, 9 /* mr */);
+ 16 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX);
}
static void f32_dwconv_up8x9__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up8x9__fma3,
- 8 /* cr */, 9 /* mr */);
+ 8 /* cr */, 9 /* mr */, benchmark::utils::CheckFMA3);
}
static void f32_dwconv_up8x9__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up8x9__fma3_acc2,
- 8 /* cr */, 9 /* mr */);
+ 8 /* cr */, 9 /* mr */, benchmark::utils::CheckFMA3);
}
static void f32_dwconv_up16x9__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up16x9__fma3,
- 16 /* cr */, 9 /* mr */);
+ 16 /* cr */, 9 /* mr */, benchmark::utils::CheckFMA3);
}
static void f32_dwconv_up16x9__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
DWConvEnd2EndBenchmark(state, model,
xnn_f32_dwconv_ukernel_up16x9__fma3_acc2,
- 16 /* cr */, 9 /* mr */);
+ 16 /* cr */, 9 /* mr */, benchmark::utils::CheckFMA3);
}
BENCHMARK_CAPTURE(f32_dwconv_up4x9__sse, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
diff --git a/bench/f32-dwconv.cc b/bench/f32-dwconv.cc
index ce9da58..06110c5 100644
--- a/bench/f32-dwconv.cc
+++ b/bench/f32-dwconv.cc
@@ -27,12 +27,16 @@
static void DWConvBenchmark(benchmark::State& state,
xnn_f32_dwconv_up_ukernel_function dwconv,
- uint32_t cr, uint32_t kr)
+ uint32_t cr, uint32_t kr,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
if (!cpuinfo_initialize()) {
state.SkipWithError("cpuinfo initialization failed");
return;
}
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
const size_t input_height = state.range(0);
const size_t input_width = state.range(1);
@@ -164,15 +168,18 @@
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
static void f32_dwconv_4x9__neon(benchmark::State& state, const char* net) {
- DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up4x9__neon, 4, 9);
+ DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up4x9__neon, 4, 9,
+ benchmark::utils::CheckNEON);
}
static void f32_dwconv_4x9__neonfma(benchmark::State& state, const char* net) {
- DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up4x9__neonfma, 4, 9);
+ DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up4x9__neonfma, 4, 9,
+ benchmark::utils::CheckNEONFMA);
}
static void f32_dwconv_8x9__neonfma(benchmark::State& state, const char* net) {
- DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up8x9__neonfma, 8, 9);
+ DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up8x9__neonfma, 8, 9,
+ benchmark::utils::CheckNEONFMA);
}
BENCHMARK_DWCONV(f32_dwconv_4x9__neon)
diff --git a/bench/f32-gemm-e2e.cc b/bench/f32-gemm-e2e.cc
index a430991..d58e773 100644
--- a/bench/f32-gemm-e2e.cc
+++ b/bench/f32-gemm-e2e.cc
@@ -27,8 +27,12 @@
xnn_f32_igemm_ukernel_function igemm,
xnn_f32_gemm_ukernel_function gemm1,
xnn_f32_igemm_ukernel_function igemm1,
- uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0)
+ uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
state.SkipWithError("failed to initialize XNNPACK");
return;
@@ -265,7 +269,8 @@
xnn_f32_igemm_ukernel_4x8__neon_lane_ld64,
xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
- 4 /* mr */, 8 /* nr */);
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -274,7 +279,8 @@
xnn_f32_igemm_ukernel_4x8__neon_lane_ld128,
xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
- 4 /* mr */, 8 /* nr */);
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEON);
}
static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -283,7 +289,8 @@
xnn_f32_igemm_ukernel_6x8__neon_lane_ld64,
xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
- 6 /* mr */, 8 /* nr */);
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
GEMMEnd2EndBenchmark(state, model,
@@ -291,7 +298,8 @@
xnn_f32_igemm_ukernel_4x8__neon_dup_ld64,
xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
- 4 /* mr */, 8 /* nr */);
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -300,7 +308,8 @@
xnn_f32_igemm_ukernel_4x8__neon_dup_ld128,
xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
- 4 /* mr */, 8 /* nr */);
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEON);
}
static void f32_gemm_6x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -309,7 +318,8 @@
xnn_f32_igemm_ukernel_6x8__neon_dup_ld64,
xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
- 6 /* mr */, 8 /* nr */);
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -318,7 +328,8 @@
xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64,
xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
- 4 /* mr */, 8 /* nr */);
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -327,7 +338,8 @@
xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128,
xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
- 4 /* mr */, 8 /* nr */);
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -336,7 +348,8 @@
xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64,
xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
- 6 /* mr */, 8 /* nr */);
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_4x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -345,7 +358,8 @@
xnn_f32_igemm_ukernel_4x8s4__neon,
xnn_f32_gemm_ukernel_1x8s4__neon,
xnn_f32_igemm_ukernel_1x8s4__neon,
- 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
+ 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
+ benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -354,7 +368,8 @@
xnn_f32_igemm_ukernel_4x8s4__neonfma,
xnn_f32_gemm_ukernel_1x8s4__neonfma,
xnn_f32_igemm_ukernel_1x8s4__neonfma,
- 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
+ 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
+ benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_6x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -363,7 +378,8 @@
xnn_f32_igemm_ukernel_6x8s4__neon,
xnn_f32_gemm_ukernel_1x8s4__neon,
xnn_f32_igemm_ukernel_1x8s4__neon,
- 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
+ 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
+ benchmark::utils::CheckNEON);
}
static void f32_gemm_6x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -372,7 +388,8 @@
xnn_f32_igemm_ukernel_6x8s4__neonfma,
xnn_f32_gemm_ukernel_1x8s4__neonfma,
xnn_f32_igemm_ukernel_1x8s4__neonfma,
- 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
+ 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
+ benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_8x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -381,7 +398,8 @@
xnn_f32_igemm_ukernel_8x8s4__neon,
xnn_f32_gemm_ukernel_1x8s4__neon,
xnn_f32_igemm_ukernel_1x8s4__neon,
- 8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
+ 8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
+ benchmark::utils::CheckNEON);
}
static void f32_gemm_8x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -390,7 +408,8 @@
xnn_f32_igemm_ukernel_8x8s4__neonfma,
xnn_f32_gemm_ukernel_1x8s4__neonfma,
xnn_f32_igemm_ukernel_1x8s4__neonfma,
- 8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
+ 8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
+ benchmark::utils::CheckNEONFMA);
}
BENCHMARK_CAPTURE(f32_gemm_4x8__neon_lane_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
@@ -420,7 +439,6 @@
BENCHMARK_CAPTURE(f32_gemm_6x8__neonfma_dup_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
BENCHMARK_CAPTURE(f32_gemm_6x8__neonfma_dup_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
-
BENCHMARK_CAPTURE(f32_gemm_4x8__neonfma_dup_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
BENCHMARK_CAPTURE(f32_gemm_4x8__neonfma_dup_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
@@ -430,8 +448,6 @@
BENCHMARK_CAPTURE(f32_gemm_6x8__neonfma_dup_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
BENCHMARK_CAPTURE(f32_gemm_6x8__neonfma_dup_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
-
-
BENCHMARK_CAPTURE(f32_gemm_4x8s4__neon, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
BENCHMARK_CAPTURE(f32_gemm_4x8s4__neon, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
@@ -487,7 +503,8 @@
xnn_f32_igemm_ukernel_4x8__avx_broadcast,
xnn_f32_gemm_ukernel_1x8__avx_broadcast,
xnn_f32_igemm_ukernel_1x8__avx_broadcast,
- 4 /* mr */, 8 /* nr */);
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckAVX);
}
static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -496,7 +513,8 @@
xnn_f32_igemm_ukernel_5x8__avx_broadcast,
xnn_f32_gemm_ukernel_1x8__avx_broadcast,
xnn_f32_igemm_ukernel_1x8__avx_broadcast,
- 5 /* mr */, 8 /* nr */);
+ 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckAVX);
}
static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -505,7 +523,8 @@
xnn_f32_igemm_ukernel_6x8__avx_broadcast,
xnn_f32_gemm_ukernel_1x8__avx_broadcast,
xnn_f32_igemm_ukernel_1x8__avx_broadcast,
- 6 /* mr */, 8 /* nr */);
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckAVX);
}
static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -514,7 +533,8 @@
xnn_f32_igemm_ukernel_7x8__avx_broadcast,
xnn_f32_gemm_ukernel_1x8__avx_broadcast,
xnn_f32_igemm_ukernel_1x8__avx_broadcast,
- 7 /* mr */, 8 /* nr */);
+ 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckAVX);
}
static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -523,7 +543,8 @@
xnn_f32_igemm_ukernel_4x8__fma3_broadcast,
xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
- 4 /* mr */, 8 /* nr */);
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckFMA3);
}
static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -532,7 +553,8 @@
xnn_f32_igemm_ukernel_5x8__fma3_broadcast,
xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
- 5 /* mr */, 8 /* nr */);
+ 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckFMA3);
}
static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -541,7 +563,8 @@
xnn_f32_igemm_ukernel_6x8__fma3_broadcast,
xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
- 6 /* mr */, 8 /* nr */);
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckFMA3);
}
static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -550,7 +573,8 @@
xnn_f32_igemm_ukernel_7x8__fma3_broadcast,
xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
- 7 /* mr */, 8 /* nr */);
+ 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckFMA3);
}
static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -559,7 +583,8 @@
xnn_f32_igemm_ukernel_8x8__fma3_broadcast,
xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
- 8 /* mr */, 8 /* nr */);
+ 8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckFMA3);
}
BENCHMARK_CAPTURE(f32_gemm_4x8__sse_load1, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index a0cf3f0..1124d83 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -35,12 +35,16 @@
static void GEMMBenchmark(benchmark::State& state,
xnn_f32_gemm_ukernel_function gemm,
- size_t mr, size_t nr, size_t kr, size_t sr)
+ size_t mr, size_t nr, size_t kr, size_t sr,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
if (!cpuinfo_initialize()) {
state.SkipWithError("cpuinfo initialization failed");
return;
}
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
const size_t mc = state.range(0);
const size_t nc = state.range(1);
@@ -105,12 +109,16 @@
static void PPMM1PBenchmark(benchmark::State& state,
xnn_f32_ppmm_ukernel_function ppmm,
xnn_x32_packx_ukernel_function packx,
- size_t mr, size_t nr)
+ size_t mr, size_t nr,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
if (!cpuinfo_initialize()) {
state.SkipWithError("cpuinfo initialization failed");
return;
}
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
const size_t mc = state.range(0);
const size_t nc = state.range(1);
@@ -177,12 +185,16 @@
static void PPMM2PBenchmark(benchmark::State& state,
xnn_f32_ppmm_ukernel_function ppmm,
xnn_x32_packx_ukernel_function packx,
- size_t mr, size_t nr)
+ size_t mr, size_t nr,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
if (!cpuinfo_initialize()) {
state.SkipWithError("cpuinfo initialization failed");
return;
}
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
const size_t mc = state.range(0);
const size_t nc = state.range(1);
@@ -445,63 +457,63 @@
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neon, 1, 8, 1, 4);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neon, 1, 8, 1, 4, benchmark::utils::CheckNEON);
}
static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neon, 4, 8, 1, 4);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neon, 4, 8, 1, 4, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neon, 6, 8, 1, 4);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neon, 6, 8, 1, 4, benchmark::utils::CheckNEON);
}
static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neon, 8, 8, 1, 4);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neon, 8, 8, 1, 4, benchmark::utils::CheckNEON);
}
static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4, benchmark::utils::CheckNEONFMA);
}
static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
- PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
+ PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8, benchmark::utils::CheckNEONFMA);
}
static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
- PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
+ PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8, benchmark::utils::CheckNEONFMA);
}
BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
@@ -554,47 +566,47 @@
}
static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__avx_broadcast, 1, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__avx_broadcast, 1, 8, 1, 1, benchmark::utils::CheckAVX);
}
static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__avx_broadcast, 4, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__avx_broadcast, 4, 8, 1, 1, benchmark::utils::CheckAVX);
}
static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__avx_broadcast, 5, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__avx_broadcast, 5, 8, 1, 1, benchmark::utils::CheckAVX);
}
static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__avx_broadcast, 6, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__avx_broadcast, 6, 8, 1, 1, benchmark::utils::CheckAVX);
}
static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__avx_broadcast, 7, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__avx_broadcast, 7, 8, 1, 1, benchmark::utils::CheckAVX);
}
static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1, benchmark::utils::CheckFMA3);
}
static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1, benchmark::utils::CheckFMA3);
}
static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1, benchmark::utils::CheckFMA3);
}
static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1, benchmark::utils::CheckFMA3);
}
static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1, benchmark::utils::CheckFMA3);
}
static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1);
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1, benchmark::utils::CheckFMA3);
}
BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index fc82032..197f758 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -27,11 +27,15 @@
static void IGEMMBenchmark(benchmark::State& state,
xnn_f32_igemm_ukernel_function f32_igemm,
- uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr)
+ uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
if (!cpuinfo_initialize()) {
state.SkipWithError("cpuinfo initialization failed");
}
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
const size_t input_height = state.range(0);
const size_t input_width = state.range(1);
@@ -152,71 +156,71 @@
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
static void f32_igemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x2__neon_lane_ld64(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x2__neon_lane_ld64, 4, 2, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x2__neon_lane_ld64, 4, 2, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x4__neon_lane_ld64(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x4__neon_lane_ld64, 4, 4, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x4__neon_lane_ld64, 4, 4, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_igemm_1x8__neon_dup_ld64(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_dup_ld64, 1, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_dup_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8__neon_dup_ld128(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_dup_ld128, 4, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_dup_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8__neon_dup_ld64(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_dup_ld64, 4, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_dup_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x8__neon_dup_ld64(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_dup_ld64, 6, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_dup_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_igemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEON);
}
static void f32_igemm_1x8s4__neon(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8s4__neon, 1, 8, 1, 4);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8s4__neon, 1, 8, 1, 4, benchmark::utils::CheckNEON);
}
static void f32_igemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_4x8s4__neon(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8s4__neon, 4, 8, 1, 4);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8s4__neon, 4, 8, 1, 4, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_6x8s4__neon(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8s4__neon, 6, 8, 1, 4);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8s4__neon, 6, 8, 1, 4, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_8x8s4__neon(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8s4__neon, 8, 8, 1, 4);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8s4__neon, 8, 8, 1, 4, benchmark::utils::CheckNEON);
}
static void f32_igemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4, benchmark::utils::CheckNEONFMA);
}
BENCHMARK_CONV(f32_igemm_1x8__neon_lane_ld64)
@@ -362,47 +366,47 @@
}
static void f32_igemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__avx_broadcast, 1, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__avx_broadcast, 1, 8, 1, 1, benchmark::utils::CheckAVX);
}
static void f32_igemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__avx_broadcast, 4, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__avx_broadcast, 4, 8, 1, 1, benchmark::utils::CheckAVX);
}
static void f32_igemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_5x8__avx_broadcast, 5, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_5x8__avx_broadcast, 5, 8, 1, 1, benchmark::utils::CheckAVX);
}
static void f32_igemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__avx_broadcast, 6, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__avx_broadcast, 6, 8, 1, 1, benchmark::utils::CheckAVX);
}
static void f32_igemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_7x8__avx_broadcast, 7, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_7x8__avx_broadcast, 7, 8, 1, 1, benchmark::utils::CheckAVX);
}
static void f32_igemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1, benchmark::utils::CheckFMA3);
}
static void f32_igemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1, benchmark::utils::CheckFMA3);
}
static void f32_igemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1, benchmark::utils::CheckFMA3);
}
static void f32_igemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1, benchmark::utils::CheckFMA3);
}
static void f32_igemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1, benchmark::utils::CheckFMA3);
}
static void f32_igemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1);
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1, benchmark::utils::CheckFMA3);
}
BENCHMARK_CONV(f32_igemm_1x8__sse_load1)
diff --git a/bench/utils.cc b/bench/utils.cc
index 0f72c4b..9dc85d7 100644
--- a/bench/utils.cc
+++ b/bench/utils.cc
@@ -255,5 +255,62 @@
}
}
+
+bool CheckNEON(benchmark::State& state) {
+ if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
+ state.SkipWithError("no NEON extension");
+ return false;
+ }
+ return true;
+}
+
+bool CheckNEONFMA(benchmark::State& state) {
+ if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fma()) {
+ state.SkipWithError("no NEON-FMA extension");
+ return false;
+ }
+ return true;
+}
+
+bool CheckSSE41(benchmark::State& state) {
+ if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) {
+ state.SkipWithError("no SSE4.1 extension");
+ return false;
+ }
+ return true;
+}
+
+bool CheckAVX(benchmark::State& state) {
+ if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx()) {
+ state.SkipWithError("no AVX extension");
+ return false;
+ }
+ return true;
+}
+
+bool CheckFMA3(benchmark::State& state) {
+ if (!cpuinfo_initialize() || !cpuinfo_has_x86_fma3()) {
+ state.SkipWithError("no FMA3 extension");
+ return false;
+ }
+ return true;
+}
+
+bool CheckAVX2(benchmark::State& state) {
+ if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx2()) {
+ state.SkipWithError("no AVX2 extension");
+ return false;
+ }
+ return true;
+}
+
+bool CheckAVX512F(benchmark::State& state) {
+ if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f()) {
+ state.SkipWithError("no AVX512F extension");
+ return false;
+ }
+ return true;
+}
+
} // namespace utils
} // namespace benchmark
diff --git a/bench/utils.h b/bench/utils.h
index 8f21af6..072b14d 100644
--- a/bench/utils.h
+++ b/bench/utils.h
@@ -29,6 +29,36 @@
// Set multi-threading parameters appropriate for the processor.
void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark);
+typedef bool (*IsaCheckFunction)(benchmark::State& state);
+
+// Check if the ARM NEON extension is supported by the host processor.
+// If it is not, or if cpuinfo fails to initialize, skip the benchmark with an error and return false.
+bool CheckNEON(benchmark::State& state);
+
+// Check if the ARM NEON-FMA extension is supported by the host processor.
+// If it is not, or if cpuinfo fails to initialize, skip the benchmark with an error and return false.
+bool CheckNEONFMA(benchmark::State& state);
+
+// Check if the x86 SSE4.1 extension is supported by the host processor.
+// If it is not, or if cpuinfo fails to initialize, skip the benchmark with an error and return false.
+bool CheckSSE41(benchmark::State& state);
+
+// Check if the x86 AVX extension is supported by the host processor.
+// If it is not, or if cpuinfo fails to initialize, skip the benchmark with an error and return false.
+bool CheckAVX(benchmark::State& state);
+
+// Check if the x86 FMA3 extension is supported by the host processor.
+// If it is not, or if cpuinfo fails to initialize, skip the benchmark with an error and return false.
+bool CheckFMA3(benchmark::State& state);
+
+// Check if the x86 AVX2 extension is supported by the host processor.
+// If it is not, or if cpuinfo fails to initialize, skip the benchmark with an error and return false.
+bool CheckAVX2(benchmark::State& state);
+
+// Check if the x86 AVX512F extension is supported by the host processor.
+// If it is not, or if cpuinfo fails to initialize, skip the benchmark with an error and return false.
+bool CheckAVX512F(benchmark::State& state);
+
template <class T>
inline T DivideRoundUp(T x, T q) {
return x / q + T(x % q != 0);