diff --git a/bench/f32-dwconv-e2e.cc b/bench/f32-dwconv-e2e.cc
index f0cce9d..b4a58db 100644
--- a/bench/f32-dwconv-e2e.cc
+++ b/bench/f32-dwconv-e2e.cc
@@ -23,8 +23,12 @@
   benchmark::State& state,
   models::ExecutionPlanFactory model_factory, 
   xnn_f32_dwconv_up_ukernel_function dwconv,
-  uint8_t cr, uint8_t mr)
+  uint8_t cr, uint8_t mr,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
 {
+  if (isa_check && !isa_check(state)) {
+    return;
+  }
   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
     state.SkipWithError("failed to initialize XNNPACK");
     return;
@@ -85,49 +89,49 @@
   static void f32_dwconv_up4x9__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up4x9__neon,
-      4 /* cr */, 9 /* mr */);
+      4 /* cr */, 9 /* mr */, benchmark::utils::CheckNEON);
   }
 
   static void f32_dwconv_up4x9__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up4x9__neon_acc2,
-      4 /* cr */, 9 /* mr */);
+      4 /* cr */, 9 /* mr */, benchmark::utils::CheckNEON);
   }
 
   static void f32_dwconv_up8x9__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up8x9__neon,
-      8 /* cr */, 9 /* mr */);
+      8 /* cr */, 9 /* mr */, benchmark::utils::CheckNEON);
   }
 
   static void f32_dwconv_up8x9__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up8x9__neon_acc2,
-      8 /* cr */, 9 /* mr */);
+      8 /* cr */, 9 /* mr */, benchmark::utils::CheckNEON);
   }
 
   static void f32_dwconv_up4x9__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up4x9__neonfma,
-      4 /* cr */, 9 /* mr */);
+      4 /* cr */, 9 /* mr */, benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_dwconv_up4x9__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2,
-      4 /* cr */, 9 /* mr */);
+      4 /* cr */, 9 /* mr */, benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_dwconv_up8x9__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up8x9__neonfma,
-      8 /* cr */, 9 /* mr */);
+      8 /* cr */, 9 /* mr */, benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_dwconv_up8x9__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2,
-      8 /* cr */, 9 /* mr */);
+      8 /* cr */, 9 /* mr */, benchmark::utils::CheckNEONFMA);
   }
 
   BENCHMARK_CAPTURE(f32_dwconv_up4x9__neon, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
@@ -184,49 +188,49 @@
   static void f32_dwconv_up8x9__avx(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up8x9__avx,
-      8 /* cr */, 9 /* mr */);
+      8 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX);
   }
 
   static void f32_dwconv_up8x9__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up8x9__avx_acc2,
-      8 /* cr */, 9 /* mr */);
+      8 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX);
   }
 
   static void f32_dwconv_up16x9__avx(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up16x9__avx,
-      16 /* cr */, 9 /* mr */);
+      16 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX);
   }
 
   static void f32_dwconv_up16x9__avx_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up16x9__avx_acc2,
-      16 /* cr */, 9 /* mr */);
+      16 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX);
   }
 
   static void f32_dwconv_up8x9__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up8x9__fma3,
-      8 /* cr */, 9 /* mr */);
+      8 /* cr */, 9 /* mr */, benchmark::utils::CheckFMA3);
   }
 
   static void f32_dwconv_up8x9__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up8x9__fma3_acc2,
-      8 /* cr */, 9 /* mr */);
+      8 /* cr */, 9 /* mr */, benchmark::utils::CheckFMA3);
   }
 
   static void f32_dwconv_up16x9__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up16x9__fma3,
-      16 /* cr */, 9 /* mr */);
+      16 /* cr */, 9 /* mr */, benchmark::utils::CheckFMA3);
   }
 
   static void f32_dwconv_up16x9__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up16x9__fma3_acc2,
-      16 /* cr */, 9 /* mr */);
+      16 /* cr */, 9 /* mr */, benchmark::utils::CheckFMA3);
   }
 
   BENCHMARK_CAPTURE(f32_dwconv_up4x9__sse, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
diff --git a/bench/f32-dwconv.cc b/bench/f32-dwconv.cc
index ce9da58..06110c5 100644
--- a/bench/f32-dwconv.cc
+++ b/bench/f32-dwconv.cc
@@ -27,12 +27,16 @@
 
 static void DWConvBenchmark(benchmark::State& state,
   xnn_f32_dwconv_up_ukernel_function dwconv,
-  uint32_t cr, uint32_t kr)
+  uint32_t cr, uint32_t kr,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
 {
   if (!cpuinfo_initialize()) {
     state.SkipWithError("cpuinfo initialization failed");
     return;
   }
+  if (isa_check && !isa_check(state)) {
+    return;
+  }
 
   const size_t input_height = state.range(0);
   const size_t input_width = state.range(1);
@@ -164,15 +168,18 @@
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   static void f32_dwconv_4x9__neon(benchmark::State& state, const char* net) {
-    DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up4x9__neon, 4, 9);
+    DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up4x9__neon, 4, 9,
+      benchmark::utils::CheckNEON);
   }
 
   static void f32_dwconv_4x9__neonfma(benchmark::State& state, const char* net) {
-    DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up4x9__neonfma, 4, 9);
+    DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up4x9__neonfma, 4, 9,
+      benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_dwconv_8x9__neonfma(benchmark::State& state, const char* net) {
-    DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up8x9__neonfma, 8, 9);
+    DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up8x9__neonfma, 8, 9,
+      benchmark::utils::CheckNEONFMA);
   }
 
   BENCHMARK_DWCONV(f32_dwconv_4x9__neon)
diff --git a/bench/f32-gemm-e2e.cc b/bench/f32-gemm-e2e.cc
index a430991..d58e773 100644
--- a/bench/f32-gemm-e2e.cc
+++ b/bench/f32-gemm-e2e.cc
@@ -27,8 +27,12 @@
   xnn_f32_igemm_ukernel_function igemm,
   xnn_f32_gemm_ukernel_function gemm1,
   xnn_f32_igemm_ukernel_function igemm1,
-  uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0)
+  uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
 {
+  if (isa_check && !isa_check(state)) {
+    return;
+  }
   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
     state.SkipWithError("failed to initialize XNNPACK");
     return;
@@ -265,7 +269,8 @@
       xnn_f32_igemm_ukernel_4x8__neon_lane_ld64,
       xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
       xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
-      4 /* mr */, 8 /* nr */);
+      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -274,7 +279,8 @@
       xnn_f32_igemm_ukernel_4x8__neon_lane_ld128,
       xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
       xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
-      4 /* mr */, 8 /* nr */);
+      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -283,7 +289,8 @@
       xnn_f32_igemm_ukernel_6x8__neon_lane_ld64,
       xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
       xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
-      6 /* mr */, 8 /* nr */);
+      6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
   }
   static void f32_gemm_4x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
@@ -291,7 +298,8 @@
       xnn_f32_igemm_ukernel_4x8__neon_dup_ld64,
       xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
       xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
-      4 /* mr */, 8 /* nr */);
+      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_4x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -300,7 +308,8 @@
       xnn_f32_igemm_ukernel_4x8__neon_dup_ld128,
       xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
       xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
-      4 /* mr */, 8 /* nr */);
+      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_6x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -309,7 +318,8 @@
       xnn_f32_igemm_ukernel_6x8__neon_dup_ld64,
       xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
       xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
-      6 /* mr */, 8 /* nr */);
+      6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -318,7 +328,8 @@
       xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64,
       xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
       xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
-      4 /* mr */, 8 /* nr */);
+      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -327,7 +338,8 @@
       xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128,
       xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
       xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
-      4 /* mr */, 8 /* nr */);
+      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -336,7 +348,8 @@
       xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64,
       xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
       xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
-      6 /* mr */, 8 /* nr */);
+      6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_gemm_4x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -345,7 +358,8 @@
       xnn_f32_igemm_ukernel_4x8s4__neon,
       xnn_f32_gemm_ukernel_1x8s4__neon,
       xnn_f32_igemm_ukernel_1x8s4__neon,
-      4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
+      4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
+      benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_4x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -354,7 +368,8 @@
       xnn_f32_igemm_ukernel_4x8s4__neonfma,
       xnn_f32_gemm_ukernel_1x8s4__neonfma,
       xnn_f32_igemm_ukernel_1x8s4__neonfma,
-      4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
+      4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
+      benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_gemm_6x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -363,7 +378,8 @@
       xnn_f32_igemm_ukernel_6x8s4__neon,
       xnn_f32_gemm_ukernel_1x8s4__neon,
       xnn_f32_igemm_ukernel_1x8s4__neon,
-      6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
+      6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
+      benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_6x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -372,7 +388,8 @@
       xnn_f32_igemm_ukernel_6x8s4__neonfma,
       xnn_f32_gemm_ukernel_1x8s4__neonfma,
       xnn_f32_igemm_ukernel_1x8s4__neonfma,
-      6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
+      6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
+      benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_gemm_8x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -381,7 +398,8 @@
       xnn_f32_igemm_ukernel_8x8s4__neon,
       xnn_f32_gemm_ukernel_1x8s4__neon,
       xnn_f32_igemm_ukernel_1x8s4__neon,
-      8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
+      8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
+      benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_8x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -390,7 +408,8 @@
       xnn_f32_igemm_ukernel_8x8s4__neonfma,
       xnn_f32_gemm_ukernel_1x8s4__neonfma,
       xnn_f32_igemm_ukernel_1x8s4__neonfma,
-      8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
+      8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
+      benchmark::utils::CheckNEONFMA);
   }
 
   BENCHMARK_CAPTURE(f32_gemm_4x8__neon_lane_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
@@ -420,7 +439,6 @@
   BENCHMARK_CAPTURE(f32_gemm_6x8__neonfma_dup_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
   BENCHMARK_CAPTURE(f32_gemm_6x8__neonfma_dup_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 
-
   BENCHMARK_CAPTURE(f32_gemm_4x8__neonfma_dup_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
   BENCHMARK_CAPTURE(f32_gemm_4x8__neonfma_dup_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 
@@ -430,8 +448,6 @@
   BENCHMARK_CAPTURE(f32_gemm_6x8__neonfma_dup_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
   BENCHMARK_CAPTURE(f32_gemm_6x8__neonfma_dup_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 
-
-
   BENCHMARK_CAPTURE(f32_gemm_4x8s4__neon, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
   BENCHMARK_CAPTURE(f32_gemm_4x8s4__neon, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 
@@ -487,7 +503,8 @@
       xnn_f32_igemm_ukernel_4x8__avx_broadcast,
       xnn_f32_gemm_ukernel_1x8__avx_broadcast,
       xnn_f32_igemm_ukernel_1x8__avx_broadcast,
-      4 /* mr */, 8 /* nr */);
+      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckAVX);
   }
 
   static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -496,7 +513,8 @@
       xnn_f32_igemm_ukernel_5x8__avx_broadcast,
       xnn_f32_gemm_ukernel_1x8__avx_broadcast,
       xnn_f32_igemm_ukernel_1x8__avx_broadcast,
-      5 /* mr */, 8 /* nr */);
+      5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckAVX);
   }
 
   static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -505,7 +523,8 @@
       xnn_f32_igemm_ukernel_6x8__avx_broadcast,
       xnn_f32_gemm_ukernel_1x8__avx_broadcast,
       xnn_f32_igemm_ukernel_1x8__avx_broadcast,
-      6 /* mr */, 8 /* nr */);
+      6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckAVX);
   }
 
   static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -514,7 +533,8 @@
       xnn_f32_igemm_ukernel_7x8__avx_broadcast,
       xnn_f32_gemm_ukernel_1x8__avx_broadcast,
       xnn_f32_igemm_ukernel_1x8__avx_broadcast,
-      7 /* mr */, 8 /* nr */);
+      7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckAVX);
   }
 
   static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -523,7 +543,8 @@
       xnn_f32_igemm_ukernel_4x8__fma3_broadcast,
       xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
       xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
-      4 /* mr */, 8 /* nr */);
+      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckFMA3);
   }
 
   static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -532,7 +553,8 @@
       xnn_f32_igemm_ukernel_5x8__fma3_broadcast,
       xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
       xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
-      5 /* mr */, 8 /* nr */);
+      5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckFMA3);
   }
 
   static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -541,7 +563,8 @@
       xnn_f32_igemm_ukernel_6x8__fma3_broadcast,
       xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
       xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
-      6 /* mr */, 8 /* nr */);
+      6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckFMA3);
   }
 
   static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -550,7 +573,8 @@
       xnn_f32_igemm_ukernel_7x8__fma3_broadcast,
       xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
       xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
-      7 /* mr */, 8 /* nr */);
+      7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckFMA3);
   }
 
   static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -559,7 +583,8 @@
       xnn_f32_igemm_ukernel_8x8__fma3_broadcast,
       xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
       xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
-      8 /* mr */, 8 /* nr */);
+      8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckFMA3);
   }
 
   BENCHMARK_CAPTURE(f32_gemm_4x8__sse_load1, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index a0cf3f0..1124d83 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -35,12 +35,16 @@
 
 static void GEMMBenchmark(benchmark::State& state,
   xnn_f32_gemm_ukernel_function gemm,
-  size_t mr, size_t nr, size_t kr, size_t sr)
+  size_t mr, size_t nr, size_t kr, size_t sr,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
 {
   if (!cpuinfo_initialize()) {
     state.SkipWithError("cpuinfo initialization failed");
     return;
   }
+  if (isa_check && !isa_check(state)) {
+    return;
+  }
 
   const size_t mc = state.range(0);
   const size_t nc = state.range(1);
@@ -105,12 +109,16 @@
 static void PPMM1PBenchmark(benchmark::State& state,
   xnn_f32_ppmm_ukernel_function ppmm,
   xnn_x32_packx_ukernel_function packx,
-  size_t mr, size_t nr)
+  size_t mr, size_t nr,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
 {
   if (!cpuinfo_initialize()) {
     state.SkipWithError("cpuinfo initialization failed");
     return;
   }
+  if (isa_check && !isa_check(state)) {
+    return;
+  }
 
   const size_t mc = state.range(0);
   const size_t nc = state.range(1);
@@ -177,12 +185,16 @@
 static void PPMM2PBenchmark(benchmark::State& state,
   xnn_f32_ppmm_ukernel_function ppmm,
   xnn_x32_packx_ukernel_function packx,
-  size_t mr, size_t nr)
+  size_t mr, size_t nr,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
 {
   if (!cpuinfo_initialize()) {
     state.SkipWithError("cpuinfo initialization failed");
     return;
   }
+  if (isa_check && !isa_check(state)) {
+    return;
+  }
 
   const size_t mc = state.range(0);
   const size_t nc = state.range(1);
@@ -445,63 +457,63 @@
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1, benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neon, 1, 8, 1, 4);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neon, 1, 8, 1, 4, benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4, benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neon, 4, 8, 1, 4);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neon, 4, 8, 1, 4, benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4, benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neon, 6, 8, 1, 4);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neon, 6, 8, 1, 4, benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4, benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neon, 8, 8, 1, 4);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neon, 8, 8, 1, 4, benchmark::utils::CheckNEON);
   }
 
   static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4, benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
-    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
+    PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8, benchmark::utils::CheckNEONFMA);
   }
 
   static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
-    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
+    PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8, benchmark::utils::CheckNEONFMA);
   }
 
   BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
@@ -554,47 +566,47 @@
   }
 
   static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__avx_broadcast, 1, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__avx_broadcast, 1, 8, 1, 1, benchmark::utils::CheckAVX);
   }
 
   static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__avx_broadcast, 4, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__avx_broadcast, 4, 8, 1, 1, benchmark::utils::CheckAVX);
   }
 
   static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__avx_broadcast, 5, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__avx_broadcast, 5, 8, 1, 1, benchmark::utils::CheckAVX);
   }
 
   static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__avx_broadcast, 6, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__avx_broadcast, 6, 8, 1, 1, benchmark::utils::CheckAVX);
   }
 
   static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__avx_broadcast, 7, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__avx_broadcast, 7, 8, 1, 1, benchmark::utils::CheckAVX);
   }
 
   static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1, benchmark::utils::CheckFMA3);
   }
 
   static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1, benchmark::utils::CheckFMA3);
   }
 
   static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1, benchmark::utils::CheckFMA3);
   }
 
   static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1, benchmark::utils::CheckFMA3);
   }
 
   static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1, benchmark::utils::CheckFMA3);
   }
 
   static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1);
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1, benchmark::utils::CheckFMA3);
   }
 
   BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index fc82032..197f758 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -27,11 +27,18 @@
 
 static void IGEMMBenchmark(benchmark::State& state,
   xnn_f32_igemm_ukernel_function f32_igemm,
-  uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr)
+  uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
 {
   if (!cpuinfo_initialize()) {
     state.SkipWithError("cpuinfo initialization failed");
+    // SkipWithError() only flags the run; leave explicitly instead of falling through.
+    return;
   }
+  // Optional ISA gate: stop here when the supplied check returns false.
+  if (isa_check && !isa_check(state)) {
+    return;
+  }
 
   const size_t input_height = state.range(0);
   const size_t input_width = state.range(1);
@@ -152,71 +156,71 @@
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   static void f32_igemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_lane_ld64, 1 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_4x2__neon_lane_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x2__neon_lane_ld64, 4, 2, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x2__neon_lane_ld64, 4 /* mr */, 2 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_4x4__neon_lane_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x4__neon_lane_ld64, 4, 4, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x4__neon_lane_ld64, 4 /* mr */, 4 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_lane_ld128, 4 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_lane_ld64, 4 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEON);
   }
 
   static void f32_igemm_1x8__neon_dup_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_dup_ld64, 1, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_dup_ld64, 1 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_4x8__neon_dup_ld128(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_dup_ld128, 4, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_dup_ld128, 4 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_4x8__neon_dup_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_dup_ld64, 4, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_dup_ld64, 4 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_6x8__neon_dup_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_dup_ld64, 6, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_dup_ld64, 6 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64, 1 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEONFMA);
   }
   static void f32_igemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128, 4 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEONFMA);
   }
   static void f32_igemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64, 4 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEONFMA);
   }
   static void f32_igemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64, 6 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEONFMA);
   }
   static void f32_igemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_lane_ld64, 6 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_1x8s4__neon(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8s4__neon, 1, 8, 1, 4);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8s4__neon, 1 /* mr */, 8 /* nr */, 1 /* kr */, 4 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8s4__neonfma, 1 /* mr */, 8 /* nr */, 1 /* kr */, 4 /* sr */, benchmark::utils::CheckNEONFMA);
   }
   static void f32_igemm_4x8s4__neon(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8s4__neon, 4, 8, 1, 4);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8s4__neon, 4 /* mr */, 8 /* nr */, 1 /* kr */, 4 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8s4__neonfma, 4 /* mr */, 8 /* nr */, 1 /* kr */, 4 /* sr */, benchmark::utils::CheckNEONFMA);
   }
   static void f32_igemm_6x8s4__neon(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8s4__neon, 6, 8, 1, 4);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8s4__neon, 6 /* mr */, 8 /* nr */, 1 /* kr */, 4 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8s4__neonfma, 6 /* mr */, 8 /* nr */, 1 /* kr */, 4 /* sr */, benchmark::utils::CheckNEONFMA);
   }
   static void f32_igemm_8x8s4__neon(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8s4__neon, 8, 8, 1, 4);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8s4__neon, 8 /* mr */, 8 /* nr */, 1 /* kr */, 4 /* sr */, benchmark::utils::CheckNEON);
   }
   static void f32_igemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8s4__neonfma, 8 /* mr */, 8 /* nr */, 1 /* kr */, 4 /* sr */, benchmark::utils::CheckNEONFMA);
   }
 
   BENCHMARK_CONV(f32_igemm_1x8__neon_lane_ld64)
@@ -362,47 +366,47 @@
   }
 
   static void f32_igemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__avx_broadcast, 1, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__avx_broadcast, 1 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckAVX);
   }
 
   static void f32_igemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__avx_broadcast, 4, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__avx_broadcast, 4 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckAVX);
   }
 
   static void f32_igemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_5x8__avx_broadcast, 5, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_5x8__avx_broadcast, 5 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckAVX);
   }
 
   static void f32_igemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__avx_broadcast, 6, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__avx_broadcast, 6 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckAVX);
   }
 
   static void f32_igemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_7x8__avx_broadcast, 7, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_7x8__avx_broadcast, 7 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckAVX);
   }
 
   static void f32_igemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__fma3_broadcast, 1 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckFMA3);
   }
 
   static void f32_igemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__fma3_broadcast, 4 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckFMA3);
   }
 
   static void f32_igemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_5x8__fma3_broadcast, 5 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckFMA3);
   }
 
   static void f32_igemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__fma3_broadcast, 6 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckFMA3);
   }
 
   static void f32_igemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_7x8__fma3_broadcast, 7 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckFMA3);
   }
 
   static void f32_igemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1);
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8__fma3_broadcast, 8 /* mr */, 8 /* nr */, 1 /* kr */, 1 /* sr */, benchmark::utils::CheckFMA3);
   }
 
   BENCHMARK_CONV(f32_igemm_1x8__sse_load1)
diff --git a/bench/utils.cc b/bench/utils.cc
index 0f72c4b..9dc85d7 100644
--- a/bench/utils.cc
+++ b/bench/utils.cc
@@ -255,5 +255,62 @@
   }
 }
 
+
+// Returns true only when cpuinfo initializes and detects ARM NEON; otherwise
+// flags `state` with the "no NEON extension" error and returns false.
+bool CheckNEON(benchmark::State& state) {
+  const bool has_neon = cpuinfo_initialize() && cpuinfo_has_arm_neon();
+  if (!has_neon) state.SkipWithError("no NEON extension");
+  return has_neon;
+}
+
+// Returns true only when cpuinfo initializes and detects ARM NEON-FMA; otherwise
+// flags `state` with the "no NEON-FMA extension" error and returns false.
+bool CheckNEONFMA(benchmark::State& state) {
+  const bool has_neon_fma = cpuinfo_initialize() && cpuinfo_has_arm_neon_fma();
+  if (!has_neon_fma) state.SkipWithError("no NEON-FMA extension");
+  return has_neon_fma;
+}
+
+// Returns true only when cpuinfo initializes and detects x86 SSE4.1; otherwise
+// flags `state` with the "no SSE4.1 extension" error and returns false.
+bool CheckSSE41(benchmark::State& state) {
+  const bool has_sse41 = cpuinfo_initialize() && cpuinfo_has_x86_sse4_1();
+  if (!has_sse41) state.SkipWithError("no SSE4.1 extension");
+  return has_sse41;
+}
+
+// Returns true only when cpuinfo initializes and detects x86 AVX; otherwise
+// flags `state` with the "no AVX extension" error and returns false.
+bool CheckAVX(benchmark::State& state) {
+  const bool has_avx = cpuinfo_initialize() && cpuinfo_has_x86_avx();
+  if (!has_avx) state.SkipWithError("no AVX extension");
+  return has_avx;
+}
+
+// Returns true only when cpuinfo initializes and detects x86 FMA3; otherwise
+// flags `state` with the "no FMA3 extension" error and returns false.
+bool CheckFMA3(benchmark::State& state) {
+  const bool has_fma3 = cpuinfo_initialize() && cpuinfo_has_x86_fma3();
+  if (!has_fma3) state.SkipWithError("no FMA3 extension");
+  return has_fma3;
+}
+
+// Returns true only when cpuinfo initializes and detects x86 AVX2; otherwise
+// flags `state` with the "no AVX2 extension" error and returns false.
+bool CheckAVX2(benchmark::State& state) {
+  const bool has_avx2 = cpuinfo_initialize() && cpuinfo_has_x86_avx2();
+  if (!has_avx2) state.SkipWithError("no AVX2 extension");
+  return has_avx2;
+}
+
+// Returns true only when cpuinfo initializes and detects x86 AVX512F; otherwise
+// flags `state` with the "no AVX512F extension" error and returns false.
+bool CheckAVX512F(benchmark::State& state) {
+  const bool has_avx512f = cpuinfo_initialize() && cpuinfo_has_x86_avx512f();
+  if (!has_avx512f) state.SkipWithError("no AVX512F extension");
+  return has_avx512f;
+}
+
 }  // namespace utils
 }  // namespace benchmark
diff --git a/bench/utils.h b/bench/utils.h
index 8f21af6..072b14d 100644
--- a/bench/utils.h
+++ b/bench/utils.h
@@ -29,6 +29,36 @@
 // Set multi-threading parameters appropriate for the processor.
 void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark);
 
+using IsaCheckFunction = bool (*)(benchmark::State& state);  // ISA predicate: the Check* functions below match this signature
+
+// Returns true if cpuinfo initializes and reports ARM NEON support.
+// Otherwise reports "no NEON extension" via state.SkipWithError and returns false.
+bool CheckNEON(benchmark::State& state);
+
+// Returns true if cpuinfo initializes and reports ARM NEON-FMA support.
+// Otherwise reports "no NEON-FMA extension" via state.SkipWithError and returns false.
+bool CheckNEONFMA(benchmark::State& state);
+
+// Returns true if cpuinfo initializes and reports x86 SSE4.1 support.
+// Otherwise reports "no SSE4.1 extension" via state.SkipWithError and returns false.
+bool CheckSSE41(benchmark::State& state);
+
+// Returns true if cpuinfo initializes and reports x86 AVX support.
+// Otherwise reports "no AVX extension" via state.SkipWithError and returns false.
+bool CheckAVX(benchmark::State& state);
+
+// Returns true if cpuinfo initializes and reports x86 FMA3 support.
+// Otherwise reports "no FMA3 extension" via state.SkipWithError and returns false.
+bool CheckFMA3(benchmark::State& state);
+
+// Returns true if cpuinfo initializes and reports x86 AVX2 support.
+// Otherwise reports "no AVX2 extension" via state.SkipWithError and returns false.
+bool CheckAVX2(benchmark::State& state);
+
+// Returns true if cpuinfo initializes and reports x86 AVX512F support.
+// Otherwise reports "no AVX512F extension" via state.SkipWithError and returns false.
+bool CheckAVX512F(benchmark::State& state);
+
 template <class T>
 inline T DivideRoundUp(T x, T q) {
   return x / q + T(x % q != 0);
