Limit direct dependencies on cpuinfo

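Define XNN_ARCH_* architecture-identification macros in <xnnpack/common.h> and use them in place of the CPUINFO_ARCH_* macros across benchmarks, tests, src/init.c, and internal headers, so that <cpuinfo.h> no longer needs to be included just for compile-time architecture checks.
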
PiperOrigin-RevId: 272245408
diff --git a/bench/f16-gemm.cc b/bench/f16-gemm.cc
index ece4b7b..e107813 100644
--- a/bench/f16-gemm.cc
+++ b/bench/f16-gemm.cc
@@ -20,6 +20,7 @@
 #include "bench/gemm.h"
 #include "bench/utils.h"
 #include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
 #include <xnnpack/pack.h>
 #include <xnnpack/params.h>
@@ -99,7 +100,7 @@
     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
 }
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   static void hgemm_4x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64, 4, 8, 1);
   }
diff --git a/bench/f32-conv-hwc.cc b/bench/f32-conv-hwc.cc
index f158623..8535b71 100644
--- a/bench/f32-conv-hwc.cc
+++ b/bench/f32-conv-hwc.cc
@@ -17,6 +17,7 @@
 #include "bench/utils.h"
 #include <xnnpack/AlignedAllocator.h>
 #include <xnnpack/conv.h>
+#include <xnnpack/common.h>
 #include <xnnpack/pack.h>
 #include <xnnpack/params.h>
 #include <xnnpack/requantization.h>
@@ -108,7 +109,7 @@
     benchmark::Counter::kIsRate);
 }
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   static void f32_conv_hwc_3x3s2p1c3x8__neonfma_2x2(benchmark::State& state, const char* net) {
     DConv3X3S2P1Benchmark(state, xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2, 8);
   }
diff --git a/bench/f32-dwconv-spchw.cc b/bench/f32-dwconv-spchw.cc
index bf0ff1e..fd44b78 100644
--- a/bench/f32-dwconv-spchw.cc
+++ b/bench/f32-dwconv-spchw.cc
@@ -16,6 +16,7 @@
 #include "bench/dwconv.h"
 #include "bench/utils.h"
 #include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
 #include <xnnpack/dwconv.h>
 #include <xnnpack/indirection.h>
 #include <xnnpack/operator.h>
@@ -258,7 +259,7 @@
     benchmark::Counter::kIsRate);
 }
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   static void CHW_3x3p1__neonfma(benchmark::State& state, const char* net) {
     DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma, 4, 4, 3, 3, 1, 1);
   }
@@ -299,10 +300,10 @@
   BENCHMARK_DWCONV(HWo4C4_5x5p2__neonfma)
   BENCHMARK_DWCONV(HWo4C4_3x3s2p1__neonfma)
   BENCHMARK_DWCONV(HWo4C4_5x5s2p2__neonfma)
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   static void CHW_3x3p1__sse(benchmark::State& state, const char* net) {
     DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__sse, 4, 4, 3, 3, 1, 1);
   }
@@ -323,7 +324,7 @@
   BENCHMARK_DWCONV(CHW_3x3s2p1__sse)
   BENCHMARK_DWCONV(HWo4C4_3x3p1__sse)
   BENCHMARK_DWCONV(HWo4C4_3x3s2p1__sse)
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #ifndef XNNPACK_BENCHMARK_NO_MAIN
diff --git a/bench/f32-dwconv.cc b/bench/f32-dwconv.cc
index 1e44836..d3967b0 100644
--- a/bench/f32-dwconv.cc
+++ b/bench/f32-dwconv.cc
@@ -16,6 +16,7 @@
 #include "bench/dwconv.h"
 #include "bench/utils.h"
 #include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
 #include <xnnpack/dwconv.h>
 #include <xnnpack/indirection.h>
 #include <xnnpack/operator.h>
@@ -147,7 +148,7 @@
     benchmark::Counter::kIsRate);
 }
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   static void f32_dwconv_4x9__aarch64_neonfma(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up4x9__neon, 4, 9);
   }
@@ -158,10 +159,10 @@
 
   BENCHMARK_DWCONV(f32_dwconv_4x9__aarch64_neonfma)
   BENCHMARK_DWCONV(f32_dwconv_4x9__aarch64_neonfma_cortex_a55)
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   static void f32_dwconv_4x9__neon(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up4x9__neon, 4, 9);
   }
@@ -177,10 +178,10 @@
   BENCHMARK_DWCONV(f32_dwconv_4x9__neon)
   BENCHMARK_DWCONV(f32_dwconv_4x9__neonfma)
   BENCHMARK_DWCONV(f32_dwconv_8x9__neonfma)
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   static void f32_dwconv_4x4__sse(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up4x4__sse, 4, 4);
   }
@@ -196,10 +197,10 @@
   BENCHMARK_DWCONV(f32_dwconv_4x4__sse)
   BENCHMARK_DWCONV(f32_dwconv_4x9__sse)
   BENCHMARK_DWCONV(f32_dwconv_4x25__sse)
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   static void f32_dwconv_4x4__psimd(benchmark::State& state, const char* net) {
     DWConvBenchmark(state, xnn_f32_dwconv_ukernel_up4x4__psimd, 4, 4);
   }
@@ -215,7 +216,7 @@
   BENCHMARK_DWCONV(f32_dwconv_4x4__psimd)
   BENCHMARK_DWCONV(f32_dwconv_4x9__psimd)
   BENCHMARK_DWCONV(f32_dwconv_4x25__psimd)
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 static void f32_dwconv_1x4__scalar(benchmark::State& state, const char* net) {
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index 8d5637f..9425971 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -22,6 +22,7 @@
 #include "bench/gemm.h"
 #include "bench/utils.h"
 #include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
 #include <xnnpack/pack.h>
 #include <xnnpack/packx.h>
@@ -330,7 +331,7 @@
 }
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   static void sgemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1);
   }
@@ -388,9 +389,9 @@
   BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a75)
   BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_ld64)
   BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_ld128)
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   static void sgemm_4x12__neon_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__neon_ld64, 4, 12, 1, 1);
   }
@@ -462,9 +463,9 @@
 
   BENCHMARK_GEMM(sppmm_4x8_unipass__neonfma)
   BENCHMARK_GEMM(sppmm_4x8_twopass__neonfma)
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   static void sgemm_1x8__sse_load1(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);
   }
@@ -505,9 +506,9 @@
   BENCHMARK_GEMM(sgemm_4x8s4__sse)
   BENCHMARK_GEMM(sppmm_4x8_unipass__sse)
   BENCHMARK_GEMM(sppmm_4x8_twopass__sse)
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   static void sgemm_4x8__psimd_loadsplat(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, 4, 8, 1, 1);
   }
@@ -548,7 +549,7 @@
   BENCHMARK_GEMM(sgemm_6x8s4__psimd)
   BENCHMARK_GEMM(sppmm_4x8_unipass__psimd)
   BENCHMARK_GEMM(sppmm_4x8_twopass__psimd)
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 static void sgemm_1x4__scalar(benchmark::State& state, const char* net) {
   GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x4__scalar, 1, 4, 1, 1);
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index cd87bb7..0e371a6 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -16,6 +16,7 @@
 #include "bench/conv.h"
 #include "bench/utils.h"
 #include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
 #include <xnnpack/indirection.h>
 #include <xnnpack/operator.h>
@@ -149,7 +150,7 @@
     benchmark::Counter::kIsRate);
 }
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   static void f32_igemm_4x2__neon_ld64(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x2__neon_ld64, 4, 2, 1, 1);
   }
@@ -212,7 +213,7 @@
   BENCHMARK_CONV(f32_igemm_6x8__neonfma_ld64)
 #endif
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   static void f32_igemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1);
   }
@@ -258,9 +259,9 @@
   BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a57)
   BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a73)
   BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a75)
-#endif  /* CPUINFO_ARCH_ARM64 */
+#endif  /* XNN_ARCH_ARM64 */
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   static void f32_igemm_1x8__sse_load1(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);
   }
@@ -291,9 +292,9 @@
   BENCHMARK_CONV(f32_igemm_4x8__sse_dup)
   BENCHMARK_CONV(f32_igemm_1x8s4__sse)
   BENCHMARK_CONV(f32_igemm_4x8s4__sse)
-#endif  /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#endif  /* XNN_ARCH_X86 || XNN_ARCH_X86_64 */
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   static void f32_igemm_1x8__psimd_loadsplat(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, 1, 8, 1, 1);
   }
@@ -341,7 +342,7 @@
   BENCHMARK_CONV(f32_igemm_1x8s4__psimd)
   BENCHMARK_CONV(f32_igemm_4x8s4__psimd)
   BENCHMARK_CONV(f32_igemm_6x8s4__psimd)
-#endif /* !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS */
+#endif /* !XNN_ARCH_WASM && !XNN_ARCH_ASMJS */
 
 static void f32_igemm_1x4__scalar(benchmark::State& state, const char* net) {
   IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x4__scalar, 1, 4, 1, 1);
diff --git a/bench/f32-im2col-gemm.cc b/bench/f32-im2col-gemm.cc
index ff11499..be4444a 100644
--- a/bench/f32-im2col-gemm.cc
+++ b/bench/f32-im2col-gemm.cc
@@ -16,6 +16,7 @@
 #include "bench/conv.h"
 #include "bench/utils.h"
 #include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
 #include <xnnpack/im2col.h>
 #include <xnnpack/pack.h>
@@ -133,21 +134,21 @@
 }
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   static void sgemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
     Im2ColGEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1);
   }
 
   BENCHMARK_CONV(sgemm_4x8__aarch64_neonfma_cortex_a75)
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   static void sgemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
     Im2ColGEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, 6, 8, 1, 1);
   }
 
   BENCHMARK_CONV(sgemm_6x8__psimd_loadsplat)
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 static void sgemm_2x4__scalar(benchmark::State& state, const char* net) {
   Im2ColGEMMBenchmark(state, xnn_f32_gemm_ukernel_2x4__scalar, 2, 4, 1, 1);
diff --git a/bench/f32-rmax.cc b/bench/f32-rmax.cc
index 0fe2cc3..696144b 100644
--- a/bench/f32-rmax.cc
+++ b/bench/f32-rmax.cc
@@ -14,6 +14,7 @@
 #include <benchmark/benchmark.h>
 #include "bench/utils.h"
 #include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
 #include <xnnpack/params.h>
 #include <xnnpack/rmax.h>
 
@@ -45,7 +46,7 @@
     benchmark::Counter(uint64_t(state.iterations()) * n * sizeof(float), benchmark::Counter::kIsRate);
 }
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   BENCHMARK_CAPTURE(f32_rmax, sse, xnn_f32_rmax_ukernel__sse)
     ->RangeMultiplier(10)
     ->Range(1000, 100000000)
@@ -60,14 +61,14 @@
     ->RangeMultiplier(10)
     ->Range(1000, 100000000)
     ->UseRealTime();
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   BENCHMARK_CAPTURE(f32_rmax, neon, xnn_f32_rmax_ukernel__neon)
     ->RangeMultiplier(10)
     ->Range(1000, 100000000)
     ->UseRealTime();
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 BENCHMARK_CAPTURE(f32_rmax, scalar, xnn_f32_rmax_ukernel__scalar)
   ->RangeMultiplier(10)
   ->Range(1000, 100000000)
diff --git a/bench/f32-spmm.cc b/bench/f32-spmm.cc
index 11c071a..4d2a9da 100644
--- a/bench/f32-spmm.cc
+++ b/bench/f32-spmm.cc
@@ -16,6 +16,7 @@
 #include "bench/gemm.h"
 #include "bench/utils.h"
 #include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
 #include <xnnpack/params.h>
 #include <xnnpack/requantization.h>
 #include <xnnpack/spmm.h>
@@ -158,7 +159,7 @@
 }
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   static void spmm80_4x1__neonfma(benchmark::State& state, const char* net) {
     SpMMBenchmark(state, xnn_f32_spmm_ukernel_4x1__neonfma, 4, 1, 0.8f);
   }
@@ -247,9 +248,9 @@
   BENCHMARK_GEMM(spmm80_4x1__neonfma_pipelined)
   BENCHMARK_GEMM(spmm80_8x1__neonfma_pipelined)
   BENCHMARK_GEMM(spmm80_16x1__neonfma_pipelined)
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   static void spmm80_4x1__sse(benchmark::State& state, const char* net) {
     SpMMBenchmark(state, xnn_f32_spmm_ukernel_4x1__sse, 4, 1, 0.8f);
   }
@@ -260,7 +261,7 @@
 
   BENCHMARK_GEMM(spmm80_4x1__sse)
   BENCHMARK_GEMM(spmm80_8x1__sse)
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 static void spmm80_1x1__scalar(benchmark::State& state, const char* net) {
   SpMMBenchmark(state, xnn_f32_spmm_ukernel_1x1__scalar, 1, 1, 0.8f);
diff --git a/bench/q8-gemm.cc b/bench/q8-gemm.cc
index 7382fc3..9a15006 100644
--- a/bench/q8-gemm.cc
+++ b/bench/q8-gemm.cc
@@ -23,6 +23,7 @@
 #include "bench/gemm.h"
 #include "bench/utils.h"
 #include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
 #include <xnnpack/pack.h>
 #include <xnnpack/params.h>
@@ -284,7 +285,7 @@
 }
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   static void q8gemm_4x8__neon(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_q8_gemm_ukernel_4x8__neon, 4, 8, 1);
   }
@@ -297,7 +298,7 @@
   BENCHMARK_GEMM(q8gemm_8x8__neon)
 #endif
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   static void q8gemm_4x4c2__sse2(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_q8_gemm_ukernel_4x4c2__sse2, 4, 4, 2);
   }
diff --git a/bench/requantization.cc b/bench/requantization.cc
index 25e73d9..d2db0b0 100644
--- a/bench/requantization.cc
+++ b/bench/requantization.cc
@@ -18,6 +18,7 @@
 #include <benchmark/benchmark.h>
 #include "bench/utils.h"
 #include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
 #include <xnnpack/requantization-stubs.h>
 
 inline uint32_t divideRoundUp(uint32_t x, uint32_t q)
@@ -151,7 +152,7 @@
   }
 }
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
 BENCHMARK_F(Requantization, precise__neon)(benchmark::State& state)
 {
   for (auto _ : state) {
@@ -185,7 +186,7 @@
 }
 #endif
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_F(Requantization, precise__sse2)(benchmark::State& state)
 {
   for (auto _ : state) {
diff --git a/src/init.c b/src/init.c
index b0ad23d..a5ca59d 100644
--- a/src/init.c
+++ b/src/init.c
@@ -18,6 +18,7 @@
 #include <xnnpack/argmaxpool.h>
 #include <xnnpack/avgpool.h>
 #include <xnnpack/clamp.h>
+#include <xnnpack/common.h>
 #include <xnnpack/conv.h>
 #include <xnnpack/dwconv.h>
 #include <xnnpack/gavgpool.h>
@@ -48,15 +49,15 @@
   .initialized = false
 };
 
-#if CPUINFO_ARCH_PNACL || CPUINFO_ARCH_ASMJS || CPUINFO_ARCH_WASM || CPUINFO_ARCH_WASMSIMD
+#if XNN_ARCH_PNACL || XNN_ARCH_ASMJS || XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   extern uint32_t xnn_stub_wasm_f32_sub(uint32_t a, uint32_t b);
 #endif
-#if CPUINFO_ARCH_PNACL || CPUINFO_ARCH_WASM || CPUINFO_ARCH_WASMSIMD
+#if XNN_ARCH_PNACL || XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   extern uint32_t xnn_stub_wasm_f32_min(uint32_t a, uint32_t b);
 #endif
 
 static void init(void) {
-#if CPUINFO_ARCH_ARM
+#if XNN_ARCH_ARM
   if (!cpuinfo_has_arm_neon()) {
     xnn_log_error("XNNPACK initialization failed: NEON is not supported");
     return;
@@ -206,7 +207,7 @@
     .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
   };
 
-#elif CPUINFO_ARCH_ARM64
+#elif XNN_ARCH_ARM64
 
   /**************************** Q8 micro-kernels ****************************/
   xnn_params.q8.gemm = (struct gemm_parameters) {
@@ -485,7 +486,7 @@
     .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
   };
 
-#elif CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
   if (!cpuinfo_has_x86_sse2()) {
     xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
     return;
@@ -649,7 +650,7 @@
     .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__sse2,
   };
 
-#elif CPUINFO_ARCH_PNACL || CPUINFO_ARCH_WASMSIMD
+#elif XNN_ARCH_PNACL || XNN_ARCH_WASMSIMD
   /**************************** Q8 micro-kernels ****************************/
   xnn_params.q8.gemm = (struct gemm_parameters) {
     .gemm = (xnn_gemm_ukernel_function) xnn_q8_gemm_ukernel_2x2__scalar,
@@ -786,7 +787,7 @@
     .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__psimd,
   };
 
-#elif CPUINFO_ARCH_WASM || CPUINFO_ARCH_ASMJS
+#elif XNN_ARCH_WASM || XNN_ARCH_ASMJS
   // Unlike most other architectures, on x86/x86-64 when floating-point instructions
   // have no NaN arguments, but produce NaN output, the output NaN has sign bit set.
   // We use it to distinguish x86/x86-64 from other architectures, by doing subtraction
diff --git a/src/wasm-stubs.c b/src/wasm-stubs.c
index 29826d7..b9273c5 100644
--- a/src/wasm-stubs.c
+++ b/src/wasm-stubs.c
@@ -12,8 +12,8 @@
   return fp32_to_bits(fp32_from_bits(a) - fp32_from_bits(b));
 }
 
-#if CPUINFO_ARCH_WASM || CPUINFO_ARCH_WASMSIMD
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 uint32_t xnn_stub_wasm_f32_min(uint32_t a, uint32_t b) {
   return fp32_to_bits(__builtin_wasm_min_f32(fp32_from_bits(a), fp32_from_bits(b)));
 }
-#endif  // CPUINFO_ARCH_WASM || CPUINFO_ARCH_WASMSIMD
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
diff --git a/src/xnnpack/allocator.h b/src/xnnpack/allocator.h
index 303aa37..c946656 100644
--- a/src/xnnpack/allocator.h
+++ b/src/xnnpack/allocator.h
@@ -12,7 +12,7 @@
   #include <malloc.h>
 #endif
 
-#include <cpuinfo.h>
+#include <xnnpack/common.h>
 
 extern int posix_memalign(void **memptr, size_t alignment, size_t size);
 
@@ -22,7 +22,7 @@
 
 inline static void* xnn_allocate_memory(size_t memory_size) {
   void* memory_ptr = NULL;
-#if CPUINFO_ARCH_ASMJS || CPUINFO_ARCH_WASM
+#if XNN_ARCH_ASMJS || XNN_ARCH_WASM
   memory_ptr = malloc(memory_size);
 #elif defined(__ANDROID__)
   memory_ptr = memalign(XNN_ALLOCATION_ALIGNMENT, memory_size);
diff --git a/src/xnnpack/common.h b/src/xnnpack/common.h
index 2dbc451..673b92a 100644
--- a/src/xnnpack/common.h
+++ b/src/xnnpack/common.h
@@ -9,6 +9,83 @@
 #pragma once
 
 
+// Define architecture identification macros
+
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(_M_IX86)
+  #define XNN_ARCH_X86 1
+#endif
+
+#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+  #define XNN_ARCH_X86_64 1
+#endif
+
+#if defined(__arm__) || defined(_M_ARM)
+  #define XNN_ARCH_ARM 1
+#endif
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  #define XNN_ARCH_ARM64 1
+#endif
+
+#if defined(__PPC64__) || defined(__ppc64__) || defined(__powerpc64__) || defined(_ARCH_PPC64)
+  #define XNN_ARCH_PPC64 1
+#endif
+
+#if defined(__pnacl__)
+  #define XNN_ARCH_PNACL 1
+#endif
+
+#if defined(__asmjs__)
+  #define XNN_ARCH_ASMJS 1
+#endif
+
+#if defined(__wasm__)
+  #if defined(__wasm_simd128__)
+    #define XNN_ARCH_WASMSIMD 1
+  #else
+    #define XNN_ARCH_WASM 1
+  #endif
+#endif
+
+// Ensure each architecture identification macro is always defined, as either 0 or 1
+
+#ifndef XNN_ARCH_X86
+  #define XNN_ARCH_X86 0
+#endif
+
+#ifndef XNN_ARCH_X86_64
+  #define XNN_ARCH_X86_64 0
+#endif
+
+#ifndef XNN_ARCH_ARM
+  #define XNN_ARCH_ARM 0
+#endif
+
+#ifndef XNN_ARCH_ARM64
+  #define XNN_ARCH_ARM64 0
+#endif
+
+#ifndef XNN_ARCH_PPC64
+  #define XNN_ARCH_PPC64 0
+#endif
+
+#ifndef XNN_ARCH_PNACL
+  #define XNN_ARCH_PNACL 0
+#endif
+
+#ifndef XNN_ARCH_ASMJS
+  #define XNN_ARCH_ASMJS 0
+#endif
+
+#ifndef XNN_ARCH_WASM
+  #define XNN_ARCH_WASM 0
+#endif
+
+#ifndef XNN_ARCH_WASMSIMD
+  #define XNN_ARCH_WASMSIMD 0
+#endif
+
+
 #if defined(__GNUC__)
   #if defined(__clang__) || (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 5)
     #define XNN_UNREACHABLE do { __builtin_unreachable(); } while (0)
diff --git a/src/xnnpack/isa-checks.h b/src/xnnpack/isa-checks.h
index 0bdf97c..e4aebd6 100644
--- a/src/xnnpack/isa-checks.h
+++ b/src/xnnpack/isa-checks.h
@@ -10,8 +10,10 @@
 
 #include <cpuinfo.h>
 
+#include <xnnpack/common.h>
 
-#if CPUINFO_ARCH_PNACL || CPUINFO_ARCH_WASMSIMD
+
+#if XNN_ARCH_PNACL || XNN_ARCH_WASMSIMD
   #define TEST_REQUIRES_PSIMD
 #else
   #define TEST_REQUIRES_PSIMD \
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 11065c4..7313032 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -12,8 +12,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <cpuinfo.h>
-
 #include <xnnpack/common.h>
 
 #define XNN_INTERNAL_EXTRA_BYTES 32
@@ -29,12 +27,12 @@
     float max;
     float min;
   } scalar;
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float max[4];
     XNN_ALIGN(16) float min[4];
   } sse;
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
 
 union xnn_f32_spchw_params {
@@ -42,7 +40,7 @@
     float max;
     float min;
   } scalar;
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
     XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
@@ -50,8 +48,8 @@
     float min;
     float max;
   } neon;
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
     XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
@@ -59,7 +57,7 @@
     XNN_ALIGN(16) float max[4];
     XNN_ALIGN(16) float min[4];
   } sse;
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
 
 union xnn_u8_output_params {
@@ -67,18 +65,18 @@
     int32_t max;
     int32_t min;
   } scalar;
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     uint8_t max;
     uint8_t min;
   } neon;
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) uint8_t max[16];
     XNN_ALIGN(16) uint8_t min[16];
   } sse2;
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
 
 union xnn_f32_avgpool_params {
@@ -87,20 +85,20 @@
     float output_min;
     float output_max;
   } scalar;
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float multiplier[4];
     XNN_ALIGN(16) float output_max[4];
     XNN_ALIGN(16) float output_min[4];
   } sse2;
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     XNN_ALIGN(16) float multiplier;
     XNN_ALIGN(16) float output_max;
     XNN_ALIGN(16) float output_min;
   } neon;
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 };
 
 union xnn_f32_gavgpool_params {
@@ -109,22 +107,22 @@
     float output_min;
     float output_max;
   } scalar;
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float multiplier[4];
     XNN_ALIGN(16) float output_max[4];
     XNN_ALIGN(16) float output_min[4];
     XNN_ALIGN(16) uint32_t mask[4];
   } sse;
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     XNN_ALIGN(16) float multiplier;
     XNN_ALIGN(16) float output_max;
     XNN_ALIGN(16) float output_min;
     XNN_ALIGN(16) uint32_t mask[4];
   } neon;
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 };
 
 union xnn_f32_hswish_params {
@@ -133,13 +131,13 @@
     float half;
     float one;
   } scalar;
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float sixth[4];
     XNN_ALIGN(16) float half[4];
     XNN_ALIGN(16) float one[4];
   } sse;
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
 
 union xnn_q8_gemm_params {
@@ -154,7 +152,7 @@
     int32_t output_max_less_zero_point;
     int32_t output_zero_point;
   } scalar;
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     int16_t kernel_zero_point;
     int16_t input_zero_point;
@@ -164,8 +162,8 @@
     uint8_t output_max;
     uint8_t output_min;
   } neon;
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int16_t kernel_zero_point[8];
     XNN_ALIGN(16) int16_t input_zero_point[8];
@@ -178,7 +176,7 @@
     XNN_ALIGN(16) uint8_t output_max[16];
     XNN_ALIGN(16) uint8_t output_min[16];
   } sse2;
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
 
 union xnn_q8_add_params {
@@ -193,7 +191,7 @@
     int32_t y_max;
     int32_t y_min;
   } scalar;
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     uint8_t a_zero_point;
     uint8_t b_zero_point;
@@ -204,8 +202,8 @@
     uint8_t y_max;
     uint8_t y_min;
   } neon;
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int32_t zero_point_product[4];
     XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
@@ -221,7 +219,7 @@
     uint32_t a_multiplier;
     uint32_t b_multiplier;
   } sse2;
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
 
 union xnn_q8_avgpool_params {
@@ -234,7 +232,7 @@
     int32_t output_max_less_zero_point;
     int32_t output_zero_point;
   } scalar;
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     int32_t bias;
     int32_t multiplier;
@@ -243,8 +241,8 @@
     uint8_t output_max;
     uint8_t output_min;
   } neon;
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int32_t bias[4];
     XNN_ALIGN(16) uint32_t multiplier[4];
@@ -254,7 +252,7 @@
     XNN_ALIGN(16) uint8_t output_max[16];
     XNN_ALIGN(16) uint8_t output_min[16];
   } sse2;
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
 
 union xnn_fp32_requantization_params {
@@ -330,7 +328,7 @@
     int32_t max_less_zero_point;
     int32_t zero_point;
   } scalar;
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     int32_t multiplier;
     int32_t right_shift;
@@ -338,8 +336,8 @@
     uint8_t max;
     uint8_t min;
   } neon;
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) uint32_t multiplier[4];
     XNN_ALIGN(16) uint64_t rounding[2];
@@ -350,7 +348,7 @@
     XNN_ALIGN(16) uint8_t max[16];
     XNN_ALIGN(16) uint8_t min[16];
   } sse2;
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
 
 union xnn_requantization_params {
diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h
index 51cff74..77f771a 100644
--- a/src/xnnpack/requantization.h
+++ b/src/xnnpack/requantization.h
@@ -22,6 +22,7 @@
 
 #include <fp16.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/params.h>
 #include <xnnpack/scalar-utils.h>
 
@@ -87,7 +88,7 @@
   assert(shift < 32);
 
   union xnn_q8_gemm_params params;
-  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
     const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
     const uint32_t remainder_threshold = remainder_mask >> 1;
     for (uint32_t i = 0; i < 8; i++) {
@@ -117,7 +118,7 @@
       params.sse2.output_max[i] = output_max;
       params.sse2.output_min[i] = output_min;
     }
-  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
     params.neon.input_zero_point = (int16_t) (uint16_t) input_zero_point;
     params.neon.kernel_zero_point = (int16_t) (uint16_t) kernel_zero_point;
     params.neon.multiplier = multiplier;
@@ -166,7 +167,7 @@
   assert(shift < 64);
 
   union xnn_q8_avgpool_params params;
-  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
     const uint32_t right_shift = (uint32_t) shift;
     const uint64_t rounding = UINT64_C(1) << (right_shift - 1);
     params.sse2.bias[0] = bias;
@@ -188,7 +189,7 @@
       params.sse2.output_max[i] = output_max;
       params.sse2.output_min[i] = output_min;
     }
-  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
     params.neon.bias = bias;
     params.neon.multiplier = multiplier;
     params.neon.left_shift = (int64_t) -shift;
@@ -252,7 +253,7 @@
   union xnn_f32_avgpool_params* params,
   float multiplier)
 {
-  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
     for (uint32_t i = 0; i < 4; i++) {
       params->sse2.multiplier[i] = multiplier;
     }
@@ -267,7 +268,7 @@
   float output_max)
 {
   union xnn_f32_avgpool_params params;
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   for (uint32_t i = 0; i < 4; i++) {
     params.sse2.multiplier[i] = multiplier;
     params.sse2.output_min[i] = output_min;
@@ -288,7 +289,7 @@
   uint32_t width)
 {
   union xnn_f32_gavgpool_params params;
-  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
     for (uint32_t i = 0; i < 4; i++) {
       params.sse.multiplier[i] = multiplier;
       params.sse.output_min[i] = output_min;
@@ -320,7 +321,7 @@
       params.sse.mask[3] = 0;
       break;
   }
-#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#elif XNN_ARCH_ARM || XNN_ARCH_ARM64
     switch (width % 4) {
       case 0:
         params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
@@ -363,7 +364,7 @@
   float multiplier,
   uint32_t width)
 {
-  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
     for (uint32_t i = 0; i < 4; i++) {
       params->sse.multiplier[i] = multiplier;
     }
@@ -393,7 +394,7 @@
         params->sse.mask[3] = 0;
         break;
     }
-  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
     params->neon.multiplier = multiplier;
     switch (width % 4) {
       case 0:
@@ -454,7 +455,7 @@
   float output_max)
 {
   union xnn_f32_output_params params;
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   for (uint32_t i = 0; i < 4; i++) {
     params.sse.min[i] = output_min;
     params.sse.max[i] = output_max;
@@ -479,7 +480,7 @@
 static inline union xnn_f32_hswish_params xnn_compute_f32_hswish_params(void)
 {
   union xnn_f32_hswish_params params;
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   for (uint32_t i = 0; i < 4; i++) {
     params.sse.sixth[i] = 0x1.555556p-3f;
     params.sse.half[i] = 0.5f;
@@ -508,7 +509,7 @@
   float output_max)
 {
   union xnn_f32_spchw_params params;
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   switch (width % 4) {
     case 0:
       params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
@@ -621,7 +622,7 @@
     params.sse.max[i] = output_max;
     params.sse.min[i] = output_min;
   }
-#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#elif XNN_ARCH_ARM || XNN_ARCH_ARM64
   switch (width % 4) {
     case 0:
       params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
@@ -743,7 +744,7 @@
   union xnn_f32_spchw_params* params,
   uint32_t width)
 {
-  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
     switch (width % 4) {
       case 0:
         params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
@@ -852,7 +853,7 @@
         params->sse.mask_odd[3] = 0;
         break;
     }
-  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
     switch (width % 4) {
       case 0:
         params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
@@ -982,12 +983,12 @@
   assert(output_min < output_max);
 
   union xnn_u8_output_params params;
-  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
     for (uint32_t i = 0; i < 16; i++) {
       params.sse2.max[i] = output_max;
       params.sse2.min[i] = output_min;
     }
-  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
     params.neon.max = output_max;
     params.neon.min = output_min;
   #else
@@ -1044,7 +1045,7 @@
   assert(b_multiplier < UINT32_C(0x00400000));
 
   union xnn_q8_add_params params;
-  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
     const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
     const uint32_t remainder_threshold = remainder_mask >> 1;
     const int32_t zero_point_product =
@@ -1072,7 +1073,7 @@
       params.sse2.y_max[i] = output_max;
       params.sse2.y_min[i] = output_min;
     }
-  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
     params.neon.a_zero_point = a_zero_point;
     params.neon.b_zero_point = b_zero_point;
     params.neon.y_zero_point = (int16_t) (uint16_t) output_zero_point;
@@ -1200,7 +1201,7 @@
   assert(shift < 32);
 
   union xnn_q31_requantization_params params;
-  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
     const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
     const uint32_t remainder_threshold = remainder_mask >> 1;
     params.sse2.multiplier[0] = multiplier;
@@ -1226,7 +1227,7 @@
       params.sse2.max[i] = max;
       params.sse2.min[i] = min;
     }
-  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
     params.neon.multiplier = multiplier;
     params.neon.right_shift = -shift;
     params.neon.zero_point = (int16_t) (uint16_t) zero_point;
diff --git a/test/f16-gemm.cc b/test/f16-gemm.cc
index 725313d..09a344d 100644
--- a/test/f16-gemm.cc
+++ b/test/f16-gemm.cc
@@ -11,18 +11,18 @@
 //   Generator: tools/generate-gemm-test.py
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
 #include <xnnpack/gemm.h>
 #include <xnnpack/igemm.h>
 #include <xnnpack/ppmm.h>
-#include <xnnpack/isa-checks.h>
-
 #include "gemm-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F16_GEMM_4X8__NEONFP16ARITH_LD64, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FP16_ARITH;
     GemmMicrokernelTester()
@@ -475,10 +475,10 @@
       .cm_stride(11)
       .Test(xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F16_GEMM_6X8__NEONFP16ARITH_LD64, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FP16_ARITH;
     GemmMicrokernelTester()
@@ -931,10 +931,10 @@
       .cm_stride(11)
       .Test(xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F16_GEMM_8X8__NEONFP16ARITH_LD64, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FP16_ARITH;
     GemmMicrokernelTester()
@@ -1387,4 +1387,4 @@
       .cm_stride(11)
       .Test(xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
diff --git a/test/f32-argmaxpool.cc b/test/f32-argmaxpool.cc
index 9ff68e3..c1a8a91 100644
--- a/test/f32-argmaxpool.cc
+++ b/test/f32-argmaxpool.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/argmaxpool.h>
 
+#include <xnnpack/argmaxpool.h>
 #include "argmaxpool-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_eq_4_fulltile) {
     TEST_REQUIRES_X86_SSE2;
     auto tester = ArgmaxPoolMicrokernelTester()
@@ -1321,10 +1321,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_eq_4_fulltile) {
     TEST_REQUIRES_PSIMD;
     auto tester = ArgmaxPoolMicrokernelTester()
@@ -2633,7 +2633,7 @@
       }
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(F32_ARGMAXPOOL_UP4__SCALAR, kc_eq_1_fulltile) {
diff --git a/test/f32-avgpool.cc b/test/f32-avgpool.cc
index 24186aa..4fd1e95 100644
--- a/test/f32-avgpool.cc
+++ b/test/f32-avgpool.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/avgpool.h>
 
+#include <xnnpack/avgpool.h>
 #include "avgpool-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_AVGPOOL_UP9__NEON, kc_eq_4_fulltile) {
     TEST_REQUIRES_ARM_NEON;
     auto tester = AvgPoolMicrokernelTester()
@@ -942,10 +942,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_AVGPOOL_UP9__SSE2, kc_eq_4_fulltile) {
     TEST_REQUIRES_X86_SSE2;
     auto tester = AvgPoolMicrokernelTester()
@@ -1875,10 +1875,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(F32_AVGPOOL_UP9__PSIMD, kc_eq_4_fulltile) {
     TEST_REQUIRES_PSIMD;
     auto tester = AvgPoolMicrokernelTester()
@@ -2808,7 +2808,7 @@
       }
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(F32_AVGPOOL_UP9__SCALAR, kc_eq_1_fulltile) {
diff --git a/test/f32-clamp.cc b/test/f32-clamp.cc
index a1536dd..6abcc53 100644
--- a/test/f32-clamp.cc
+++ b/test/f32-clamp.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/clamp.h>
 
+#include <xnnpack/clamp.h>
 #include "clamp-microkernel-tester.h"
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(F32_CLAMP__PSIMD, n_eq_4) {
     TEST_REQUIRES_PSIMD;
     ClampMicrokernelTester()
@@ -85,7 +85,7 @@
       }
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(F32_CLAMP__SCALAR, n_eq_2) {
@@ -154,7 +154,7 @@
   }
 }
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_CLAMP__NEON, n_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     ClampMicrokernelTester()
@@ -227,9 +227,9 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_CLAMP__SSE, n_eq_4) {
     TEST_REQUIRES_X86_SSE2;
     ClampMicrokernelTester()
@@ -302,4 +302,4 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/test/f32-conv-hwc.cc b/test/f32-conv-hwc.cc
index b44dbe1..94179ff 100644
--- a/test/f32-conv-hwc.cc
+++ b/test/f32-conv-hwc.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
-#include <xnnpack/conv.h>
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
 
+#include <xnnpack/conv.h>
 #include "conv-hwc-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, input_width_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     ConvHWCMicrokernelTester()
@@ -290,10 +290,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, input_width_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     ConvHWCMicrokernelTester()
@@ -571,4 +571,4 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
diff --git a/test/f32-conv-hwc2spchw.cc b/test/f32-conv-hwc2spchw.cc
index 6d79a46..51530bb 100644
--- a/test/f32-conv-hwc2spchw.cc
+++ b/test/f32-conv-hwc2spchw.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
-#include <xnnpack/conv.h>
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
 
+#include <xnnpack/conv.h>
 #include "conv-hwc2spchw-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, input_width_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     ConvHWC2SpCHWMicrokernelTester()
@@ -290,4 +290,4 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
diff --git a/test/f32-dwconv-spchw.cc b/test/f32-dwconv-spchw.cc
index 76fdcd1..914b1cc 100644
--- a/test/f32-dwconv-spchw.cc
+++ b/test/f32-dwconv-spchw.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
-#include <xnnpack/dwconv.h>
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
 
+#include <xnnpack/dwconv.h>
 #include "dwconv-spchw-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_DWCONV_SPCHW_3X3P1__SSE, input_width_eq_4) {
     TEST_REQUIRES_X86_SSE;
     DWConvSpCHWMicrokernelTester()
@@ -180,9 +180,9 @@
         .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_eq_4) {
     TEST_REQUIRES_X86_SSE;
     DWConvSpCHWMicrokernelTester()
@@ -360,10 +360,10 @@
         .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     DWConvSpCHWMicrokernelTester()
@@ -531,10 +531,10 @@
         .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma);
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     DWConvSpCHWMicrokernelTester()
@@ -712,10 +712,10 @@
         .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     DWConvSpCHWMicrokernelTester()
@@ -885,10 +885,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     DWConvSpCHWMicrokernelTester()
@@ -1066,4 +1066,4 @@
         .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
diff --git a/test/f32-dwconv.cc b/test/f32-dwconv.cc
index f692a72..c9df99c 100644
--- a/test/f32-dwconv.cc
+++ b/test/f32-dwconv.cc
@@ -11,16 +11,16 @@
 //   Generator: tools/generate-dwconv-test.py
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
-#include <xnnpack/dwconv.h>
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
 
+#include <xnnpack/dwconv.h>
 #include "dwconv-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_DWCONV_UP4X9__AARCH64_NEONFMA, c_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     DWConvMicrokernelTester()
@@ -176,10 +176,10 @@
         .Test(xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma);
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_DWCONV_UP4X9__AARCH64_NEONFMA_CORTEX_A55, c_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     DWConvMicrokernelTester()
@@ -344,10 +344,10 @@
         .Test(xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55);
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_DWCONV_UP4X9__NEONFMA, c_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     DWConvMicrokernelTester()
@@ -503,10 +503,10 @@
         .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_DWCONV_UP4X9__NEON, c_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     DWConvMicrokernelTester()
@@ -662,10 +662,10 @@
         .Test(xnn_f32_dwconv_ukernel_up4x9__neon);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_DWCONV_UP4X25__SSE, c_eq_4) {
     TEST_REQUIRES_X86_SSE;
     DWConvMicrokernelTester()
@@ -821,10 +821,10 @@
         .Test(xnn_f32_dwconv_ukernel_up4x25__sse);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_DWCONV_UP4X9__SSE, c_eq_4) {
     TEST_REQUIRES_X86_SSE;
     DWConvMicrokernelTester()
@@ -980,10 +980,10 @@
         .Test(xnn_f32_dwconv_ukernel_up4x9__sse);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_DWCONV_UP4X4__SSE, c_eq_4) {
     TEST_REQUIRES_X86_SSE;
     DWConvMicrokernelTester()
@@ -1139,10 +1139,10 @@
         .Test(xnn_f32_dwconv_ukernel_up4x4__sse);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_DWCONV_UP4X25__PSIMD, c_eq_4) {
     TEST_REQUIRES_PSIMD;
     DWConvMicrokernelTester()
@@ -1298,10 +1298,10 @@
         .Test(xnn_f32_dwconv_ukernel_up4x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
     }
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_DWCONV_UP4X9__PSIMD, c_eq_4) {
     TEST_REQUIRES_PSIMD;
     DWConvMicrokernelTester()
@@ -1457,10 +1457,10 @@
         .Test(xnn_f32_dwconv_ukernel_up4x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
     }
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_DWCONV_UP4X4__PSIMD, c_eq_4) {
     TEST_REQUIRES_PSIMD;
     DWConvMicrokernelTester()
@@ -1616,7 +1616,7 @@
         .Test(xnn_f32_dwconv_ukernel_up4x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
     }
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
 TEST(F32_DWCONV_UP1X4__SCALAR, c_eq_1) {
diff --git a/test/f32-gavgpool-spchw.cc b/test/f32-gavgpool-spchw.cc
index f12e8e4..89ee4ea 100644
--- a/test/f32-gavgpool-spchw.cc
+++ b/test/f32-gavgpool-spchw.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/gavgpool.h>
 
+#include <xnnpack/gavgpool.h>
 #include "gavgpool-spchw-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GAVGPOOL_SPCHW__NEON_X4, elements_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     GAvgPoolSpCHWMicrokernelTester()
@@ -108,10 +108,10 @@
         .Test(xnn_f32_gavgpool_spchw_ukernel__neon_x4);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GAVGPOOL_SPCHW__SSE_X4, elements_eq_4) {
     TEST_REQUIRES_X86_SSE;
     GAvgPoolSpCHWMicrokernelTester()
@@ -207,4 +207,4 @@
         .Test(xnn_f32_gavgpool_spchw_ukernel__sse_x4);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/test/f32-gavgpool.cc b/test/f32-gavgpool.cc
index 95e44cc..95c3f5c 100644
--- a/test/f32-gavgpool.cc
+++ b/test/f32-gavgpool.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/gavgpool.h>
 
+#include <xnnpack/gavgpool.h>
 #include "gavgpool-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GAVGPOOL_UP7__NEON, n_eq_4_fulltile) {
     TEST_REQUIRES_ARM_NEON;
     GAvgPoolMicrokernelTester()
@@ -428,10 +428,10 @@
       } 
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GAVGPOOL_UP7__SSE2, n_eq_4_fulltile) {
     TEST_REQUIRES_X86_SSE2;
     GAvgPoolMicrokernelTester()
@@ -847,10 +847,10 @@
       } 
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(F32_GAVGPOOL_UP7__PSIMD, n_eq_4_fulltile) {
     TEST_REQUIRES_PSIMD;
     GAvgPoolMicrokernelTester()
@@ -1266,7 +1266,7 @@
       } 
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(F32_GAVGPOOL_UP7__SCALAR, n_eq_1_fulltile) {
diff --git a/test/f32-gemm.cc b/test/f32-gemm.cc
index 9bcb043..e8858ec 100644
--- a/test/f32-gemm.cc
+++ b/test/f32-gemm.cc
@@ -11,18 +11,18 @@
 //   Generator: tools/generate-gemm-test.py
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
 #include <xnnpack/gemm.h>
 #include <xnnpack/igemm.h>
 #include <xnnpack/ppmm.h>
-#include <xnnpack/isa-checks.h>
-
 #include "gemm-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -520,10 +520,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -1021,10 +1021,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -1522,10 +1522,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -2023,10 +2023,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -2524,10 +2524,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -3025,10 +3025,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -3526,10 +3526,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -4027,10 +4027,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -4528,10 +4528,10 @@
       .cm_stride(17)
       .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -5029,10 +5029,10 @@
       .cm_stride(17)
       .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -5485,10 +5485,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -5941,10 +5941,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -6397,10 +6397,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -6853,10 +6853,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_4X12__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -7309,10 +7309,10 @@
       .cm_stride(17)
       .Test(xnn_f32_gemm_ukernel_4x12__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_1X8__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -7765,10 +7765,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_4X2__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -8221,10 +8221,10 @@
       .cm_stride(5)
       .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_4X8__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -8677,10 +8677,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_4X8__NEON_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -9133,10 +9133,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_5X8__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -9589,10 +9589,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_6X8__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -10045,10 +10045,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_1X8__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -10501,10 +10501,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_4X12__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -10957,10 +10957,10 @@
       .cm_stride(17)
       .Test(xnn_f32_gemm_ukernel_4x12__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_4X8__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -11413,10 +11413,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_4X8__NEONFMA_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -11869,10 +11869,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_5X8__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -12325,10 +12325,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMM_6X8__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -12781,10 +12781,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GEMM_1X8__SSE_LOAD1, k_eq_1) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -13135,10 +13135,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GEMM_4X8__SSE_LOAD1, k_eq_1) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -13489,10 +13489,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GEMM_1X8__SSE_DUP, k_eq_4) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -13945,10 +13945,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GEMM_4X8__SSE_DUP, k_eq_4) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -14401,10 +14401,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GEMM_1X8S4__SSE, k_eq_4) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -14857,10 +14857,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GEMM_4X8S4__SSE, k_eq_4) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -15313,10 +15313,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_eq_1) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -15667,10 +15667,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_eq_1) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -16021,10 +16021,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_eq_1) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -16375,10 +16375,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -16831,10 +16831,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -17287,10 +17287,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -17743,10 +17743,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMM_1X8S4__PSIMD, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -18199,10 +18199,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMM_4X8S4__PSIMD, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -18655,10 +18655,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMM_6X8S4__PSIMD, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -19111,7 +19111,7 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
 TEST(F32_GEMM_1X4__SCALAR, k_eq_1) {
diff --git a/test/f32-gemminc.cc b/test/f32-gemminc.cc
index ade257d..f7774a1 100644
--- a/test/f32-gemminc.cc
+++ b/test/f32-gemminc.cc
@@ -11,18 +11,18 @@
 //   Generator: tools/generate-gemm-test.py
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
 #include <xnnpack/gemm.h>
 #include <xnnpack/igemm.h>
 #include <xnnpack/ppmm.h>
-#include <xnnpack/isa-checks.h>
-
 #include "gemm-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -520,10 +520,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -1021,10 +1021,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -1522,10 +1522,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -2023,10 +2023,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -2524,10 +2524,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -3025,10 +3025,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -3526,10 +3526,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -4027,10 +4027,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -4528,10 +4528,10 @@
       .cm_stride(17)
       .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -5029,10 +5029,10 @@
       .cm_stride(17)
       .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -5485,10 +5485,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -5941,10 +5941,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -6397,10 +6397,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -6853,10 +6853,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMMINC_4X12__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -7309,10 +7309,10 @@
       .cm_stride(17)
       .Test(xnn_f32_gemminc_ukernel_4x12__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMMINC_1X8__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -7765,10 +7765,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMMINC_4X8__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -8221,10 +8221,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMMINC_4X8__NEON_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -8677,10 +8677,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMMINC_5X8__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -9133,10 +9133,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMMINC_6X8__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -9589,10 +9589,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -10045,10 +10045,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMMINC_4X12__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -10501,10 +10501,10 @@
       .cm_stride(17)
       .Test(xnn_f32_gemminc_ukernel_4x12__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -10957,10 +10957,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -11413,10 +11413,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -11869,10 +11869,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -12325,10 +12325,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_eq_1) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -12679,10 +12679,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_eq_1) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -13033,10 +13033,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GEMMINC_1X8__SSE_DUP, k_eq_4) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -13489,10 +13489,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GEMMINC_4X8__SSE_DUP, k_eq_4) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -13945,10 +13945,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GEMMINC_1X8S4__SSE, k_eq_4) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -14401,10 +14401,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_GEMMINC_4X8S4__SSE, k_eq_4) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -14857,10 +14857,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_eq_1) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -15211,10 +15211,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_eq_1) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -15565,10 +15565,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_eq_1) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -15919,10 +15919,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -16375,10 +16375,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -16831,10 +16831,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -17287,10 +17287,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMMINC_1X8S4__PSIMD, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -17743,10 +17743,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMMINC_4X8S4__PSIMD, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -18199,10 +18199,10 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_GEMMINC_6X8S4__PSIMD, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -18655,7 +18655,7 @@
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
 TEST(F32_GEMMINC_1X4__SCALAR, k_eq_1) {
diff --git a/test/f32-hswish.cc b/test/f32-hswish.cc
index 97eee7b..05a3861 100644
--- a/test/f32-hswish.cc
+++ b/test/f32-hswish.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/hswish.h>
 
+#include <xnnpack/hswish.h>
 #include "hswish-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_HSWISH__NEON, n_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     HSwishMicrokernelTester()
@@ -102,10 +102,10 @@
         .Test(xnn_f32_hswish_ukernel__neonfma);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_HSWISH__SSE, n_eq_4) {
     TEST_REQUIRES_X86_SSE2;
     HSwishMicrokernelTester()
@@ -150,10 +150,10 @@
         .Test(xnn_f32_hswish_ukernel__sse);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(F32_HSWISH__PSIMD, n_eq_4) {
     TEST_REQUIRES_PSIMD;
     HSwishMicrokernelTester()
@@ -198,7 +198,7 @@
         .Test(xnn_f32_hswish_ukernel__psimd, HSwishMicrokernelTester::Variant::Scalar);
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(F32_HSWISH__SCALAR, n_eq_1) {
diff --git a/test/f32-igemm.cc b/test/f32-igemm.cc
index c665194..a64615d 100644
--- a/test/f32-igemm.cc
+++ b/test/f32-igemm.cc
@@ -11,18 +11,18 @@
 //   Generator: tools/generate-gemm-test.py
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
 #include <xnnpack/gemm.h>
 #include <xnnpack/igemm.h>
 #include <xnnpack/ppmm.h>
-#include <xnnpack/isa-checks.h>
-
 #include "gemm-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -518,10 +518,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -1017,10 +1017,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -1516,10 +1516,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -2015,10 +2015,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -2514,10 +2514,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -3013,10 +3013,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -3512,10 +3512,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -4011,10 +4011,10 @@
       .cm_stride(17)
       .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -4510,10 +4510,10 @@
       .cm_stride(17)
       .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_4X12__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -4978,10 +4978,10 @@
       .cm_stride(17)
       .Test(xnn_f32_igemm_ukernel_4x12__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_1X8__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -5446,10 +5446,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_4X2__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -5914,10 +5914,10 @@
       .cm_stride(5)
       .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_4X4__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -6382,10 +6382,10 @@
       .cm_stride(7)
       .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_4X8__NEON_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -6850,10 +6850,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_4X8__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -7318,10 +7318,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_6X8__NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -7786,10 +7786,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_4X12__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -8254,10 +8254,10 @@
       .cm_stride(17)
       .Test(xnn_f32_igemm_ukernel_4x12__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_4X2__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -8722,10 +8722,10 @@
       .cm_stride(5)
       .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_4X4__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -9190,10 +9190,10 @@
       .cm_stride(7)
       .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_4X8__NEONFMA_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -9658,10 +9658,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_4X8__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -10126,10 +10126,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_IGEMM_6X8__NEONFMA_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -10594,10 +10594,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_IGEMM_1X8__SSE_LOAD1, k_eq_1) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -10992,10 +10992,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_IGEMM_4X8__SSE_LOAD1, k_eq_1) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -11390,10 +11390,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_IGEMM_1X8__SSE_DUP, k_eq_4) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -11858,10 +11858,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_IGEMM_4X8__SSE_DUP, k_eq_4) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -12326,10 +12326,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_IGEMM_4X2C4__SSE, k_eq_4) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -12794,10 +12794,10 @@
       .cm_stride(5)
       .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_IGEMM_4X2C4__PSIMD, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -13262,10 +13262,10 @@
       .cm_stride(5)
       .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_eq_1) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -13660,10 +13660,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_eq_1) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -14058,10 +14058,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_eq_1) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -14456,10 +14456,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -14924,10 +14924,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -15392,10 +15392,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -15860,10 +15860,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_IGEMM_1X8S4__PSIMD, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -16328,10 +16328,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_IGEMM_4X8S4__PSIMD, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -16796,10 +16796,10 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_IGEMM_6X8S4__PSIMD, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -17264,7 +17264,7 @@
       .cm_stride(11)
       .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
 TEST(F32_IGEMM_1X4__SCALAR, k_eq_1) {
diff --git a/test/f32-maxpool.cc b/test/f32-maxpool.cc
index da38a88..159a907 100644
--- a/test/f32-maxpool.cc
+++ b/test/f32-maxpool.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/maxpool.h>
 
+#include <xnnpack/maxpool.h>
 #include "maxpool-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_unipass_fulltile) {
     TEST_REQUIRES_X86_SSE;
     auto tester = MaxPoolMicrokernelTester()
@@ -1214,10 +1214,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_unipass_fulltile) {
     TEST_REQUIRES_PSIMD;
     auto tester = MaxPoolMicrokernelTester()
@@ -2419,7 +2419,7 @@
       }
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile) {
diff --git a/test/f32-pavgpool.cc b/test/f32-pavgpool.cc
index c4c2f90..4c7be12 100644
--- a/test/f32-pavgpool.cc
+++ b/test/f32-pavgpool.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/pavgpool.h>
 
+#include <xnnpack/pavgpool.h>
 #include "avgpool-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_PAVGPOOL_UP9__NEON, kc_eq_4_fulltile) {
     TEST_REQUIRES_ARM_NEON;
     auto tester = AvgPoolMicrokernelTester()
@@ -942,10 +942,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_PAVGPOOL_UP9__SSE2, kc_eq_4_fulltile) {
     TEST_REQUIRES_X86_SSE2;
     auto tester = AvgPoolMicrokernelTester()
@@ -1875,10 +1875,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(F32_PAVGPOOL_UP9__PSIMD, kc_eq_4_fulltile) {
     TEST_REQUIRES_PSIMD;
     auto tester = AvgPoolMicrokernelTester()
@@ -2808,7 +2808,7 @@
       }
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(F32_PAVGPOOL_UP9__SCALAR, kc_eq_1_fulltile) {
diff --git a/test/f32-ppmm.cc b/test/f32-ppmm.cc
index 659dd52..773806b 100644
--- a/test/f32-ppmm.cc
+++ b/test/f32-ppmm.cc
@@ -11,18 +11,18 @@
 //   Generator: tools/generate-gemm-test.py
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
 #include <xnnpack/gemm.h>
 #include <xnnpack/igemm.h>
 #include <xnnpack/ppmm.h>
-#include <xnnpack/isa-checks.h>
-
 #include "gemm-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_PPMM_4X8__NEON, k_eq_1) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -357,10 +357,10 @@
       .cm_stride(11)
       .Test(xnn_f32_ppmm_ukernel_4x8__neon);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_PPMM_4X8__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -695,10 +695,10 @@
       .cm_stride(11)
       .Test(xnn_f32_ppmm_ukernel_4x8__neonfma);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_PPMM_8X8__NEON, k_eq_1) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -1033,10 +1033,10 @@
       .cm_stride(11)
       .Test(xnn_f32_ppmm_ukernel_8x8__neon);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_PPMM_8X8__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
@@ -1371,10 +1371,10 @@
       .cm_stride(11)
       .Test(xnn_f32_ppmm_ukernel_8x8__neonfma);
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_PPMM_4X8__SSE, k_eq_1) {
     TEST_REQUIRES_X86_SSE;
     GemmMicrokernelTester()
@@ -1709,10 +1709,10 @@
       .cm_stride(11)
       .Test(xnn_f32_ppmm_ukernel_4x8__sse);
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_PPMM_4X8__PSIMD, k_eq_1) {
     TEST_REQUIRES_PSIMD;
     GemmMicrokernelTester()
@@ -2047,7 +2047,7 @@
       .cm_stride(11)
       .Test(xnn_f32_ppmm_ukernel_4x8__psimd, GemmMicrokernelTester::Variant::Scalar);
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
 TEST(F32_PPMM_4X2__SCALAR, k_eq_1) {
diff --git a/test/f32-prelu.cc b/test/f32-prelu.cc
index 43e248d..306601d 100644
--- a/test/f32-prelu.cc
+++ b/test/f32-prelu.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/prelu.h>
 
+#include <xnnpack/prelu.h>
 #include "prelu-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_PRELU_X4__SSE2, fulltile_n_eq_4) {
     TEST_REQUIRES_X86_SSE2;
     PReLUMicrokernelTester()
@@ -181,10 +181,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(F32_PRELU_X4__PSIMD, fulltile_n_eq_4) {
     TEST_REQUIRES_PSIMD;
     PReLUMicrokernelTester()
@@ -353,7 +353,7 @@
       }
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(F32_PRELU_X4__SCALAR, fulltile_n_eq_1) {
diff --git a/test/f32-rmax.cc b/test/f32-rmax.cc
index 5661f32..5d8bf06 100644
--- a/test/f32-rmax.cc
+++ b/test/f32-rmax.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/rmax.h>
 
+#include <xnnpack/rmax.h>
 #include "rmax-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_RMAX__NEON, n_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t n = 1; n < 16; n++) {
@@ -46,9 +46,9 @@
         .Test(xnn_f32_rmax_ukernel__neon);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_RMAX__SSE, n_lt_16) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t n = 1; n < 16; n++) {
@@ -150,7 +150,7 @@
         .Test(xnn_f32_rmax_ukernel__avx512f);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 TEST(F32_RMAX__SCALAR, n_lt_4) {
   for (size_t n = 1; n < 4; n++) {
diff --git a/test/f32-spmm.cc b/test/f32-spmm.cc
index 535457e..cb3f72a 100644
--- a/test/f32-spmm.cc
+++ b/test/f32-spmm.cc
@@ -8,16 +8,16 @@
 //   Generator: tools/generate-spmm-test.py
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
-#include <xnnpack/spmm.h>
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
 
+#include <xnnpack/spmm.h>
 #include "spmm-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_4X1__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -179,10 +179,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_4X2__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -389,10 +389,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_4X4__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -599,10 +599,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_4X1__NEONFMA_PIPELINED, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -764,10 +764,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_4X1__NEONFMA_UNROLL2, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -957,10 +957,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_8X1__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -1122,10 +1122,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_8X2__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -1332,10 +1332,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_8X4__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -1542,10 +1542,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_8X1__NEONFMA_PIPELINED, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -1707,10 +1707,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_8X1__NEONFMA_UNROLL2, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -1900,10 +1900,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_12X1__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -2065,10 +2065,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_12X2__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -2275,10 +2275,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_12X4__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -2485,10 +2485,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_16X1__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -2650,10 +2650,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_16X2__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -2860,10 +2860,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_16X4__NEONFMA, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -3070,10 +3070,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_16X1__NEONFMA_PIPELINED, k_eq_1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -3235,10 +3235,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM64
   TEST(F32_SPMM_16X1__NEONFMA_UNROLL2, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     SpMMMicrokernelTester()
@@ -3428,10 +3428,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_SPMM_4X1__SSE, k_eq_1) {
     TEST_REQUIRES_X86_SSE;
     SpMMMicrokernelTester()
@@ -3593,10 +3593,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_SPMM_8X1__SSE, k_eq_1) {
     TEST_REQUIRES_X86_SSE;
     SpMMMicrokernelTester()
@@ -3758,7 +3758,7 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 TEST(F32_SPMM_1X1__SCALAR, k_eq_1) {
diff --git a/test/f32-vadd.cc b/test/f32-vadd.cc
index de89b23..cc91e5e 100644
--- a/test/f32-vadd.cc
+++ b/test/f32-vadd.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/vadd.h>
 
+#include <xnnpack/vadd.h>
 #include "vadd-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_VADD__SSE, n_eq_4) {
     TEST_REQUIRES_X86_SSE;
     VAddMicrokernelTester()
@@ -116,10 +116,10 @@
         .Test(xnn_f32_vadd_ukernel__sse);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(F32_VADD__PSIMD, n_eq_4) {
     TEST_REQUIRES_PSIMD;
     VAddMicrokernelTester()
@@ -223,7 +223,7 @@
         .Test(xnn_f32_vadd_ukernel__psimd, VAddMicrokernelTester::Variant::Scalar);
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(F32_VADD__SCALAR, n_eq_1) {
diff --git a/test/f32-vmul.cc b/test/f32-vmul.cc
index 81c6200..6230023 100644
--- a/test/f32-vmul.cc
+++ b/test/f32-vmul.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/vmul.h>
 
+#include <xnnpack/vmul.h>
 #include "vmul-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_VMUL__SSE, n_eq_4) {
     TEST_REQUIRES_X86_SSE;
     VMulMicrokernelTester()
@@ -116,10 +116,10 @@
         .Test(xnn_f32_vmul_ukernel__sse);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(F32_VMUL__PSIMD, n_eq_4) {
     TEST_REQUIRES_PSIMD;
     VMulMicrokernelTester()
@@ -223,7 +223,7 @@
         .Test(xnn_f32_vmul_ukernel__psimd, VMulMicrokernelTester::Variant::Scalar);
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(F32_VMUL__SCALAR, n_eq_1) {
diff --git a/test/f32-vmulcaddc.cc b/test/f32-vmulcaddc.cc
index b925e24..ca1ac82 100644
--- a/test/f32-vmulcaddc.cc
+++ b/test/f32-vmulcaddc.cc
@@ -8,16 +8,16 @@
 //   Generator: tools/generate-vmulcaddc-test.py
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
-#include <xnnpack/vmulcaddc.h>
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
 
+#include <xnnpack/vmulcaddc.h>
 #include "vmulcaddc-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_VMULCADDC_C4__NEONFMA_X2, c_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     VMulCAddCMicrokernelTester()
@@ -155,10 +155,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_VMULCADDC_C4__NEON_X2, c_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     VMulCAddCMicrokernelTester()
@@ -296,10 +296,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_VMULCADDC_C4__SSE_X2, c_eq_4) {
     TEST_REQUIRES_X86_SSE;
     VMulCAddCMicrokernelTester()
@@ -437,10 +437,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_VMULCADDC_C4__PSIMD_X2, c_eq_4) {
     TEST_REQUIRES_PSIMD;
     VMulCAddCMicrokernelTester()
@@ -578,7 +578,7 @@
       }
     }
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
 TEST(F32_VMULCADDC_C1__SCALAR_X2, c_eq_1) {
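Note that the architecture #if blocks and the TEST_REQUIRES_* statements play different roles: the XNN_ARCH_* macros decide at compile time which tests exist for the target, while the TEST_REQUIRES_* macros from <xnnpack/isa-checks.h> still verify at run time that the executing CPU supports the required ISA, which is where the remaining cpuinfo usage is concentrated. A hedged sketch of how such a runtime guard can be built on cpuinfo and gtest follows; it shows the pattern, not the verbatim contents of <xnnpack/isa-checks.h>.

// Sketch, not the actual header: a runtime ISA guard that skips the test when
// the CPU lacks the required feature. cpuinfo_initialize(),
// cpuinfo_has_arm_neon() and cpuinfo_has_x86_sse2() are cpuinfo API calls;
// GTEST_SKIP() is provided by gtest.
#include <cpuinfo.h>
#include <gtest/gtest.h>

#define TEST_REQUIRES_ARM_NEON                              \
  do {                                                      \
    if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) { \
      GTEST_SKIP();                                         \
    }                                                       \
  } while (0)

#define TEST_REQUIRES_X86_SSE2                              \
  do {                                                      \
    if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse2()) { \
      GTEST_SKIP();                                         \
    }                                                       \
  } while (0)
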
diff --git a/test/f32-vsub.cc b/test/f32-vsub.cc
index 65cd8db..9f7b1a8 100644
--- a/test/f32-vsub.cc
+++ b/test/f32-vsub.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/vsub.h>
 
+#include <xnnpack/vsub.h>
 #include "vsub-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_VSUB__SSE, n_eq_4) {
     TEST_REQUIRES_X86_SSE;
     VSubMicrokernelTester()
@@ -104,10 +104,10 @@
         .Test(xnn_f32_vsub_ukernel__sse);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(F32_VSUB__PSIMD, n_eq_4) {
     TEST_REQUIRES_PSIMD;
     VSubMicrokernelTester()
@@ -199,7 +199,7 @@
         .Test(xnn_f32_vsub_ukernel__psimd, VSubMicrokernelTester::Variant::Scalar);
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(F32_VSUB__SCALAR, n_eq_1) {
diff --git a/test/q8-avgpool.cc b/test/q8-avgpool.cc
index 1c7a8d9..86104cd 100644
--- a/test/q8-avgpool.cc
+++ b/test/q8-avgpool.cc
@@ -6,16 +6,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/avgpool.h>
 
+#include <xnnpack/avgpool.h>
 #include "avgpool-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(Q8_AVGPOOL_UP9__NEON, kc_eq_8_fulltile) {
     TEST_REQUIRES_ARM_NEON;
     auto tester = AvgPoolMicrokernelTester()
@@ -1117,9 +1117,9 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(Q8_AVGPOOL_UP9__SSE2, kc_eq_8_fulltile) {
     TEST_REQUIRES_X86_SSE2;
     auto tester = AvgPoolMicrokernelTester()
@@ -2221,7 +2221,7 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 TEST(Q8_AVGPOOL_UP9__SCALAR, kc_eq_1_fulltile) {
   auto tester = AvgPoolMicrokernelTester()
diff --git a/test/q8-dwconv.cc b/test/q8-dwconv.cc
index 4786906..8a0c41a 100644
--- a/test/q8-dwconv.cc
+++ b/test/q8-dwconv.cc
@@ -11,16 +11,16 @@
 //   Generator: tools/generate-dwconv-test.py
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
-#include <xnnpack/dwconv.h>
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
 
+#include <xnnpack/dwconv.h>
 #include "dwconv-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM
+#if XNN_ARCH_ARM
   TEST(Q8_DWCONV_UP8X9__AARCH32_NEON, c_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     DWConvMicrokernelTester()
@@ -204,10 +204,10 @@
         .Test(xnn_q8_dwconv_ukernel_up8x9__aarch32_neon);
     }
   }
-#endif  // CPUINFO_ARCH_ARM
+#endif  // XNN_ARCH_ARM
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(Q8_DWCONV_UP8X9__NEON, c_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     DWConvMicrokernelTester()
@@ -391,10 +391,10 @@
         .Test(xnn_q8_dwconv_ukernel_up8x9__neon);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(Q8_DWCONV_UP8X9__SSE2, c_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     DWConvMicrokernelTester()
@@ -578,7 +578,7 @@
         .Test(xnn_q8_dwconv_ukernel_up8x9__sse2);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 TEST(Q8_DWCONV_UP1X9__SCALAR, c_eq_1) {
diff --git a/test/q8-gavgpool.cc b/test/q8-gavgpool.cc
index 779702d..79a27bf 100644
--- a/test/q8-gavgpool.cc
+++ b/test/q8-gavgpool.cc
@@ -6,16 +6,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/gavgpool.h>
 
+#include <xnnpack/gavgpool.h>
 #include "gavgpool-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(Q8_GAVGPOOL_UP7__NEON, n_eq_8_fulltile) {
     TEST_REQUIRES_ARM_NEON;
     GAvgPoolMicrokernelTester()
@@ -775,9 +775,9 @@
       } 
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(Q8_GAVGPOOL_UP7__SSE2, n_eq_8_fulltile) {
     TEST_REQUIRES_X86_SSE2;
     GAvgPoolMicrokernelTester()
@@ -1537,7 +1537,7 @@
       } 
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 TEST(Q8_GAVGPOOL_UP7__SCALAR, n_eq_1_fulltile) {
   GAvgPoolMicrokernelTester()
diff --git a/test/q8-gemm.cc b/test/q8-gemm.cc
index 796cb7d..847e9c2 100644
--- a/test/q8-gemm.cc
+++ b/test/q8-gemm.cc
@@ -11,18 +11,18 @@
 //   Generator: tools/generate-gemm-test.py
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
 #include <xnnpack/gemm.h>
 #include <xnnpack/igemm.h>
 #include <xnnpack/ppmm.h>
-#include <xnnpack/isa-checks.h>
-
 #include "gemm-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(Q8_GEMM_4X8__NEON, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -524,10 +524,10 @@
         .Test(xnn_q8_gemm_ukernel_4x8__neon);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(Q8_GEMM_8X8__NEON, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -1029,10 +1029,10 @@
         .Test(xnn_q8_gemm_ukernel_8x8__neon);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(Q8_GEMM_2X4C8__SSE2, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
@@ -1534,10 +1534,10 @@
         .Test(xnn_q8_gemm_ukernel_2x4c8__sse2);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(Q8_GEMM_4X4C2__SSE2, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
@@ -2039,7 +2039,7 @@
         .Test(xnn_q8_gemm_ukernel_4x4c2__sse2);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 TEST(Q8_GEMM_2X2__SCALAR, k_eq_1) {
diff --git a/test/q8-igemm.cc b/test/q8-igemm.cc
index bc8f45a..e10829c 100644
--- a/test/q8-igemm.cc
+++ b/test/q8-igemm.cc
@@ -11,18 +11,18 @@
 //   Generator: tools/generate-gemm-test.py
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
 #include <xnnpack/gemm.h>
 #include <xnnpack/igemm.h>
 #include <xnnpack/ppmm.h>
-#include <xnnpack/isa-checks.h>
-
 #include "gemm-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(Q8_IGEMM_4X8__NEON, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -536,10 +536,10 @@
         .Test(xnn_q8_igemm_ukernel_4x8__neon);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(Q8_IGEMM_8X8__NEON, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -1053,10 +1053,10 @@
         .Test(xnn_q8_igemm_ukernel_8x8__neon);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(Q8_IGEMM_4X4C2__SSE2, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
@@ -1570,7 +1570,7 @@
         .Test(xnn_q8_igemm_ukernel_4x4c2__sse2);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 TEST(Q8_IGEMM_2X2__SCALAR, k_eq_1) {
diff --git a/test/q8-vadd.cc b/test/q8-vadd.cc
index b075792..5cd7715 100644
--- a/test/q8-vadd.cc
+++ b/test/q8-vadd.cc
@@ -6,16 +6,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/vadd.h>
 
+#include <xnnpack/vadd.h>
 #include "vadd-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(Q8_VADD__SSE2, n_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     VAddMicrokernelTester()
@@ -183,9 +183,9 @@
         .Test(xnn_q8_vadd_ukernel__sse2);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(Q8_VADD__NEON, n_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     VAddMicrokernelTester()
@@ -353,7 +353,7 @@
         .Test(xnn_q8_vadd_ukernel__neon);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 TEST(Q8_VADD__SCALAR, n_eq_1) {
   VAddMicrokernelTester()
diff --git a/test/requantization.cc b/test/requantization.cc
index a5a4edf..2337722 100644
--- a/test/requantization.cc
+++ b/test/requantization.cc
@@ -10,10 +10,11 @@
 #include <cstddef>
 #include <cstdlib>
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
-#include <xnnpack/requantization-stubs.h>
 
+#include <xnnpack/common.h>
+
+#include <xnnpack/requantization-stubs.h>
 #include "requantization-tester.h"
 
 
@@ -398,750 +399,746 @@
 }
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  /*
+   * Precise SSE2 implementation using floating-point shuffle.
+   */
 
-/*
- * Precise SSE2 implementation using floating-point shuffle.
- */
-
-TEST(PRECISE__SSE2, exact_divide_by_po2) {
-  for (uint32_t s = 1; s < 32; s++) {
-    RequantizationTester()
-      .s(s)
-      .testExactDivideByPO2(xnn_requantize_precise__sse2);
-  }
-}
-
-TEST(PRECISE__SSE2, exact_divide_by_po2_with_zero_point) {
-  for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+  TEST(PRECISE__SSE2, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
-        .zeroPoint(zeroPoint)
         .s(s)
         .testExactDivideByPO2(xnn_requantize_precise__sse2);
     }
   }
-}
 
-TEST(PRECISE__SSE2, divide_by_po2_with_rounding_up) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingUp(xnn_requantize_precise__sse2);
+  TEST(PRECISE__SSE2, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testExactDivideByPO2(xnn_requantize_precise__sse2);
+      }
     }
   }
-}
 
-TEST(PRECISE__SSE2, divide_by_po2_with_rounding_down) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingDown(xnn_requantize_precise__sse2);
+  TEST(PRECISE__SSE2, divide_by_po2_with_rounding_up) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingUp(xnn_requantize_precise__sse2);
+      }
     }
   }
-}
 
-TEST(PRECISE__SSE2, divide_by_po2_with_rounding_away) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingAway(xnn_requantize_precise__sse2);
+  TEST(PRECISE__SSE2, divide_by_po2_with_rounding_down) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingDown(xnn_requantize_precise__sse2);
+      }
     }
   }
-}
 
-TEST(PRECISE__SSE2, special_cases) {
-  RequantizationTester()
-    .testSpecialCases(xnn_requantize_precise__sse2);
-}
+  TEST(PRECISE__SSE2, divide_by_po2_with_rounding_away) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingAway(xnn_requantize_precise__sse2);
+      }
+    }
+  }
 
-TEST(PRECISE__SSE2, random_cases) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesPrecise(xnn_requantize_precise__sse2);
-}
-
-
-/*
- * Precise SSSE3 implementation using floating-point shuffle.
- */
-
-TEST(PRECISE__SSSE3, exact_divide_by_po2) {
-  for (uint32_t s = 1; s < 32; s++) {
+  TEST(PRECISE__SSE2, special_cases) {
     RequantizationTester()
-      .s(s)
-      .testExactDivideByPO2(xnn_requantize_precise__ssse3);
+      .testSpecialCases(xnn_requantize_precise__sse2);
   }
-}
 
-TEST(PRECISE__SSSE3, exact_divide_by_po2_with_zero_point) {
-  for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+  TEST(PRECISE__SSE2, random_cases) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesPrecise(xnn_requantize_precise__sse2);
+  }
+
+
+  /*
+   * Precise SSSE3 implementation using floating-point shuffle.
+   */
+
+  TEST(PRECISE__SSSE3, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
-        .zeroPoint(zeroPoint)
         .s(s)
         .testExactDivideByPO2(xnn_requantize_precise__ssse3);
     }
   }
-}
 
-TEST(PRECISE__SSSE3, divide_by_po2_with_rounding_up) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingUp(xnn_requantize_precise__ssse3);
+  TEST(PRECISE__SSSE3, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testExactDivideByPO2(xnn_requantize_precise__ssse3);
+      }
     }
   }
-}
 
-TEST(PRECISE__SSSE3, divide_by_po2_with_rounding_down) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingDown(xnn_requantize_precise__ssse3);
+  TEST(PRECISE__SSSE3, divide_by_po2_with_rounding_up) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingUp(xnn_requantize_precise__ssse3);
+      }
     }
   }
-}
 
-TEST(PRECISE__SSSE3, divide_by_po2_with_rounding_away) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingAway(xnn_requantize_precise__ssse3);
+  TEST(PRECISE__SSSE3, divide_by_po2_with_rounding_down) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingDown(xnn_requantize_precise__ssse3);
+      }
     }
   }
-}
 
-TEST(PRECISE__SSSE3, special_cases) {
-  RequantizationTester()
-    .testSpecialCases(xnn_requantize_precise__ssse3);
-}
+  TEST(PRECISE__SSSE3, divide_by_po2_with_rounding_away) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingAway(xnn_requantize_precise__ssse3);
+      }
+    }
+  }
 
-TEST(PRECISE__SSSE3, random_cases) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesPrecise(xnn_requantize_precise__ssse3);
-}
-
-
-/*
- * Precise SSE4.1 implementation using static blend instruction.
- */
-
-TEST(PRECISE__SSE4, exact_divide_by_po2) {
-  for (uint32_t s = 1; s < 32; s++) {
+  TEST(PRECISE__SSSE3, special_cases) {
     RequantizationTester()
-      .s(s)
-      .testExactDivideByPO2(xnn_requantize_precise__sse4);
+      .testSpecialCases(xnn_requantize_precise__ssse3);
   }
-}
 
-TEST(PRECISE__SSE4, exact_divide_by_po2_with_zero_point) {
-  for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+  TEST(PRECISE__SSSE3, random_cases) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesPrecise(xnn_requantize_precise__ssse3);
+  }
+
+
+  /*
+   * Precise SSE4.1 implementation using static blend instruction.
+   */
+
+  TEST(PRECISE__SSE4, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
-        .zeroPoint(zeroPoint)
         .s(s)
         .testExactDivideByPO2(xnn_requantize_precise__sse4);
     }
   }
-}
 
-TEST(PRECISE__SSE4, divide_by_po2_with_rounding_up) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingUp(xnn_requantize_precise__sse4);
+  TEST(PRECISE__SSE4, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testExactDivideByPO2(xnn_requantize_precise__sse4);
+      }
     }
   }
-}
 
-TEST(PRECISE__SSE4, divide_by_po2_with_rounding_down) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingDown(xnn_requantize_precise__sse4);
+  TEST(PRECISE__SSE4, divide_by_po2_with_rounding_up) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingUp(xnn_requantize_precise__sse4);
+      }
     }
   }
-}
 
-TEST(PRECISE__SSE4, divide_by_po2_with_rounding_away) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingAway(xnn_requantize_precise__sse4);
+  TEST(PRECISE__SSE4, divide_by_po2_with_rounding_down) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingDown(xnn_requantize_precise__sse4);
+      }
     }
   }
-}
 
-TEST(PRECISE__SSE4, special_cases) {
-  RequantizationTester()
-    .testSpecialCases(xnn_requantize_precise__sse4);
-}
+  TEST(PRECISE__SSE4, divide_by_po2_with_rounding_away) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingAway(xnn_requantize_precise__sse4);
+      }
+    }
+  }
 
-TEST(PRECISE__SSE4, random_cases) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesPrecise(xnn_requantize_precise__sse4);
-}
-
-
-/*
- * FP32-based x86 SSE2 implementation.
- */
-
-TEST(FP32__SSE2, random_cases) {
-  RequantizationTester()
-    .iterations(1000)
-    .testRandomCasesApproximate(xnn_requantize_fp32__sse2);
-}
-
-
-/*
- * Q31-based x86 SSE2 implementation.
- */
-
-TEST(Q31__SSE2, exact_divide_by_po2) {
-  for (uint32_t s = 1; s < 32; s++) {
+  TEST(PRECISE__SSE4, special_cases) {
     RequantizationTester()
-      .s(s)
-      .testExactDivideByPO2(xnn_requantize_q31__sse2);
+      .testSpecialCases(xnn_requantize_precise__sse4);
   }
-}
 
-TEST(Q31__SSE2, exact_divide_by_po2_with_zero_point) {
-  for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+  TEST(PRECISE__SSE4, random_cases) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesPrecise(xnn_requantize_precise__sse4);
+  }
+
+
+  /*
+   * FP32-based x86 SSE2 implementation.
+   */
+
+  TEST(FP32__SSE2, random_cases) {
+    RequantizationTester()
+      .iterations(1000)
+      .testRandomCasesApproximate(xnn_requantize_fp32__sse2);
+  }
+
+
+  /*
+   * Q31-based x86 SSE2 implementation.
+   */
+
+  TEST(Q31__SSE2, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
-        .zeroPoint(zeroPoint)
         .s(s)
         .testExactDivideByPO2(xnn_requantize_q31__sse2);
     }
   }
-}
 
-TEST(Q31__SSE2, divide_by_po2_with_rounding_up) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingUp(xnn_requantize_q31__sse2);
+  TEST(Q31__SSE2, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testExactDivideByPO2(xnn_requantize_q31__sse2);
+      }
     }
   }
-}
 
-/* No rounding down Test - it fails because of upward bias in multiplication */
-
-TEST(Q31__SSE2, divide_by_po2_with_rounding_away) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingAway(xnn_requantize_q31__sse2);
+  TEST(Q31__SSE2, divide_by_po2_with_rounding_up) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingUp(xnn_requantize_q31__sse2);
+      }
     }
   }
-}
 
-TEST(Q31__SSE2, special_cases) {
-  RequantizationTester()
-    .testSpecialCases(xnn_requantize_q31__sse2);
-}
+  /* No rounding down Test - it fails because of upward bias in multiplication */
 
-TEST(Q31__SSE2, random_cases) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesApproximate(xnn_requantize_q31__sse2);
-}
+  TEST(Q31__SSE2, divide_by_po2_with_rounding_away) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingAway(xnn_requantize_q31__sse2);
+      }
+    }
+  }
 
-TEST(Q31__SSE2, random_match_gemmlowp) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesAgainstReference(xnn_requantize_q31__sse2, xnn_requantize_gemmlowp__sse2);
-}
-
-
-/*
- * Q31-based x86 SSSE3 implementation.
- */
-
-TEST(Q31__SSSE3, exact_divide_by_po2) {
-  for (uint32_t s = 1; s < 32; s++) {
+  TEST(Q31__SSE2, special_cases) {
     RequantizationTester()
-      .s(s)
-      .testExactDivideByPO2(xnn_requantize_q31__ssse3);
+      .testSpecialCases(xnn_requantize_q31__sse2);
   }
-}
 
-TEST(Q31__SSSE3, exact_divide_by_po2_with_zero_point) {
-  for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+  TEST(Q31__SSE2, random_cases) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesApproximate(xnn_requantize_q31__sse2);
+  }
+
+  TEST(Q31__SSE2, random_match_gemmlowp) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesAgainstReference(xnn_requantize_q31__sse2, xnn_requantize_gemmlowp__sse2);
+  }
+
+
+  /*
+   * Q31-based x86 SSSE3 implementation.
+   */
+
+  TEST(Q31__SSSE3, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
-        .zeroPoint(zeroPoint)
         .s(s)
         .testExactDivideByPO2(xnn_requantize_q31__ssse3);
     }
   }
-}
 
-TEST(Q31__SSSE3, divide_by_po2_with_rounding_up) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingUp(xnn_requantize_q31__ssse3);
+  TEST(Q31__SSSE3, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testExactDivideByPO2(xnn_requantize_q31__ssse3);
+      }
     }
   }
-}
 
-/* No rounding down Test - it fails because of upward bias in multiplication */
-
-TEST(Q31__SSSE3, divide_by_po2_with_rounding_away) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingAway(xnn_requantize_q31__ssse3);
+  TEST(Q31__SSSE3, divide_by_po2_with_rounding_up) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingUp(xnn_requantize_q31__ssse3);
+      }
     }
   }
-}
 
-TEST(Q31__SSSE3, special_cases) {
-  RequantizationTester()
-    .testSpecialCases(xnn_requantize_q31__ssse3);
-}
+  /* No rounding down Test - it fails because of upward bias in multiplication */
 
-TEST(Q31__SSSE3, random_cases) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesApproximate(xnn_requantize_q31__ssse3);
-}
+  TEST(Q31__SSSE3, divide_by_po2_with_rounding_away) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingAway(xnn_requantize_q31__ssse3);
+      }
+    }
+  }
 
-TEST(Q31__SSSE3, random_match_gemmlowp) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesAgainstReference(xnn_requantize_q31__ssse3, xnn_requantize_gemmlowp__ssse3);
-}
-
-
-/*
- * Q31-based x86 SSE4 implementation.
- */
-
-TEST(Q31__SSE4, exact_divide_by_po2) {
-  for (uint32_t s = 1; s < 32; s++) {
+  TEST(Q31__SSSE3, special_cases) {
     RequantizationTester()
-      .s(s)
-      .testExactDivideByPO2(xnn_requantize_q31__sse4);
+      .testSpecialCases(xnn_requantize_q31__ssse3);
   }
-}
 
-TEST(Q31__SSE4, exact_divide_by_po2_with_zero_point) {
-  for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+  TEST(Q31__SSSE3, random_cases) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesApproximate(xnn_requantize_q31__ssse3);
+  }
+
+  TEST(Q31__SSSE3, random_match_gemmlowp) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesAgainstReference(xnn_requantize_q31__ssse3, xnn_requantize_gemmlowp__ssse3);
+  }
+
+
+  /*
+   * Q31-based x86 SSE4 implementation.
+   */
+
+  TEST(Q31__SSE4, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
-        .zeroPoint(zeroPoint)
         .s(s)
         .testExactDivideByPO2(xnn_requantize_q31__sse4);
     }
   }
-}
 
-TEST(Q31__SSE4, divide_by_po2_with_rounding_up) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingUp(xnn_requantize_q31__sse4);
+  TEST(Q31__SSE4, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testExactDivideByPO2(xnn_requantize_q31__sse4);
+      }
     }
   }
-}
 
-/* No rounding down Test - it fails because of upward bias in multiplication */
-
-TEST(Q31__SSE4, divide_by_po2_with_rounding_away) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingAway(xnn_requantize_q31__sse4);
+  TEST(Q31__SSE4, divide_by_po2_with_rounding_up) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingUp(xnn_requantize_q31__sse4);
+      }
     }
   }
-}
 
-TEST(Q31__SSE4, special_cases) {
-  RequantizationTester()
-    .testSpecialCases(xnn_requantize_q31__sse4);
-}
+  /* No rounding down Test - it fails because of upward bias in multiplication */
 
-TEST(Q31__SSE4, random_cases) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesApproximate(xnn_requantize_q31__sse4);
-}
+  TEST(Q31__SSE4, divide_by_po2_with_rounding_away) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingAway(xnn_requantize_q31__sse4);
+      }
+    }
+  }
 
-TEST(Q31__SSE4, random_match_gemmlowp) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesAgainstReference(xnn_requantize_q31__sse4, xnn_requantize_gemmlowp__sse4);
-}
-
-
-/*
- * x86 SSE2 implementation from gemmlowp.
- */
-
-TEST(GEMMLOWP__SSE2, exact_divide_by_po2) {
-  for (uint32_t s = 1; s < 32; s++) {
+  TEST(Q31__SSE4, special_cases) {
     RequantizationTester()
-      .s(s)
-      .testExactDivideByPO2(xnn_requantize_gemmlowp__sse2);
+      .testSpecialCases(xnn_requantize_q31__sse4);
   }
-}
 
-TEST(GEMMLOWP__SSE2, exact_divide_by_po2_with_zero_point) {
-  for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+  TEST(Q31__SSE4, random_cases) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesApproximate(xnn_requantize_q31__sse4);
+  }
+
+  TEST(Q31__SSE4, random_match_gemmlowp) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesAgainstReference(xnn_requantize_q31__sse4, xnn_requantize_gemmlowp__sse4);
+  }
+
+
+  /*
+   * x86 SSE2 implementation from gemmlowp.
+   */
+
+  TEST(GEMMLOWP__SSE2, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
-        .zeroPoint(zeroPoint)
         .s(s)
         .testExactDivideByPO2(xnn_requantize_gemmlowp__sse2);
     }
   }
-}
 
-TEST(GEMMLOWP__SSE2, divide_by_po2_with_rounding_up) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingUp(xnn_requantize_gemmlowp__sse2);
+  TEST(GEMMLOWP__SSE2, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testExactDivideByPO2(xnn_requantize_gemmlowp__sse2);
+      }
     }
   }
-}
 
-/* No rounding down Test - it fails because of upward bias in multiplication */
-
-TEST(GEMMLOWP__SSE2, divide_by_po2_with_rounding_away) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingAway(xnn_requantize_gemmlowp__sse2);
+  TEST(GEMMLOWP__SSE2, divide_by_po2_with_rounding_up) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingUp(xnn_requantize_gemmlowp__sse2);
+      }
     }
   }
-}
 
-TEST(GEMMLOWP__SSE2, special_cases) {
-  RequantizationTester()
-    .testSpecialCases(xnn_requantize_gemmlowp__sse2);
-}
+  /* No rounding down Test - it fails because of upward bias in multiplication */
 
-TEST(GEMMLOWP__SSE2, random_cases) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesApproximate(xnn_requantize_gemmlowp__sse2);
-}
+  TEST(GEMMLOWP__SSE2, divide_by_po2_with_rounding_away) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingAway(xnn_requantize_gemmlowp__sse2);
+      }
+    }
+  }
 
-
-/*
- * x86 SSSE3 implementation from gemmlowp.
- */
-
-TEST(GEMMLOWP__SSSE3, exact_divide_by_po2) {
-  for (uint32_t s = 1; s < 32; s++) {
+  TEST(GEMMLOWP__SSE2, special_cases) {
     RequantizationTester()
-      .s(s)
-      .testExactDivideByPO2(xnn_requantize_gemmlowp__ssse3);
+      .testSpecialCases(xnn_requantize_gemmlowp__sse2);
   }
-}
 
-TEST(GEMMLOWP__SSSE3, exact_divide_by_po2_with_zero_point) {
-  for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+  TEST(GEMMLOWP__SSE2, random_cases) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesApproximate(xnn_requantize_gemmlowp__sse2);
+  }
+
+
+  /*
+   * x86 SSSE3 implementation from gemmlowp.
+   */
+
+  TEST(GEMMLOWP__SSSE3, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
-        .zeroPoint(zeroPoint)
         .s(s)
         .testExactDivideByPO2(xnn_requantize_gemmlowp__ssse3);
     }
   }
-}
 
-TEST(GEMMLOWP__SSSE3, divide_by_po2_with_rounding_up) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingUp(xnn_requantize_gemmlowp__ssse3);
+  TEST(GEMMLOWP__SSSE3, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testExactDivideByPO2(xnn_requantize_gemmlowp__ssse3);
+      }
     }
   }
-}
 
-/* No rounding down Test - it fails because of upward bias in multiplication */
-
-TEST(GEMMLOWP__SSSE3, divide_by_po2_with_rounding_away) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingAway(xnn_requantize_gemmlowp__ssse3);
+  TEST(GEMMLOWP__SSSE3, divide_by_po2_with_rounding_up) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingUp(xnn_requantize_gemmlowp__ssse3);
+      }
     }
   }
-}
 
-TEST(GEMMLOWP__SSSE3, special_cases) {
-  RequantizationTester()
-    .testSpecialCases(xnn_requantize_gemmlowp__ssse3);
-}
+  /* No rounding down Test - it fails because of upward bias in multiplication */
 
-TEST(GEMMLOWP__SSSE3, random_cases) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesApproximate(xnn_requantize_gemmlowp__ssse3);
-}
+  TEST(GEMMLOWP__SSSE3, divide_by_po2_with_rounding_away) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingAway(xnn_requantize_gemmlowp__ssse3);
+      }
+    }
+  }
 
-
-/*
- * x86 SSE4 implementation from gemmlowp.
- */
-
-TEST(GEMMLOWP__SSE4, exact_divide_by_po2) {
-  for (uint32_t s = 1; s < 32; s++) {
+  TEST(GEMMLOWP__SSSE3, special_cases) {
     RequantizationTester()
-      .s(s)
-      .testExactDivideByPO2(xnn_requantize_gemmlowp__sse4);
+      .testSpecialCases(xnn_requantize_gemmlowp__ssse3);
   }
-}
 
-TEST(GEMMLOWP__SSE4, exact_divide_by_po2_with_zero_point) {
-  for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+  TEST(GEMMLOWP__SSSE3, random_cases) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesApproximate(xnn_requantize_gemmlowp__ssse3);
+  }
+
+
+  /*
+   * x86 SSE4 implementation from gemmlowp.
+   */
+
+  TEST(GEMMLOWP__SSE4, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
-        .zeroPoint(zeroPoint)
         .s(s)
         .testExactDivideByPO2(xnn_requantize_gemmlowp__sse4);
     }
   }
-}
 
-TEST(GEMMLOWP__SSE4, divide_by_po2_with_rounding_up) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingUp(xnn_requantize_gemmlowp__sse4);
+  TEST(GEMMLOWP__SSE4, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testExactDivideByPO2(xnn_requantize_gemmlowp__sse4);
+      }
     }
   }
-}
 
-/* No rounding down Test - it fails because of upward bias in multiplication */
-
-TEST(GEMMLOWP__SSE4, divide_by_po2_with_rounding_away) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingAway(xnn_requantize_gemmlowp__sse4);
+  TEST(GEMMLOWP__SSE4, divide_by_po2_with_rounding_up) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingUp(xnn_requantize_gemmlowp__sse4);
+      }
     }
   }
-}
 
-TEST(GEMMLOWP__SSE4, special_cases) {
-  RequantizationTester()
-    .testSpecialCases(xnn_requantize_gemmlowp__sse4);
-}
+  /* No rounding down Test - it fails because of upward bias in multiplication */
 
-TEST(GEMMLOWP__SSE4, random_cases) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesApproximate(xnn_requantize_gemmlowp__sse4);
-}
+  TEST(GEMMLOWP__SSE4, divide_by_po2_with_rounding_away) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingAway(xnn_requantize_gemmlowp__sse4);
+      }
+    }
+  }
 
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
-
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
-
-/*
- * Precise ARM NEON implementation.
- */
-
-TEST(PRECISE__NEON, exact_divide_by_po2) {
-  for (uint32_t s = 1; s < 32; s++) {
+  TEST(GEMMLOWP__SSE4, special_cases) {
     RequantizationTester()
-      .s(s)
-      .testExactDivideByPO2(xnn_requantize_precise__neon);
+      .testSpecialCases(xnn_requantize_gemmlowp__sse4);
   }
-}
 
-TEST(PRECISE__NEON, exact_divide_by_po2_with_zero_point) {
-  for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+  TEST(GEMMLOWP__SSE4, random_cases) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesApproximate(xnn_requantize_gemmlowp__sse4);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  /*
+   * Precise ARM NEON implementation.
+   */
+
+  TEST(PRECISE__NEON, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
-        .zeroPoint(zeroPoint)
         .s(s)
         .testExactDivideByPO2(xnn_requantize_precise__neon);
     }
   }
-}
 
-TEST(PRECISE__NEON, divide_by_po2_with_rounding_up) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingUp(xnn_requantize_precise__neon);
+  TEST(PRECISE__NEON, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testExactDivideByPO2(xnn_requantize_precise__neon);
+      }
     }
   }
-}
 
-TEST(PRECISE__NEON, divide_by_po2_with_rounding_down) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingDown(xnn_requantize_precise__neon);
+  TEST(PRECISE__NEON, divide_by_po2_with_rounding_up) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingUp(xnn_requantize_precise__neon);
+      }
     }
   }
-}
 
-TEST(PRECISE__NEON, divide_by_po2_with_rounding_away) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingAway(xnn_requantize_precise__neon);
+  TEST(PRECISE__NEON, divide_by_po2_with_rounding_down) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingDown(xnn_requantize_precise__neon);
+      }
     }
   }
-}
 
-TEST(PRECISE__NEON, special_cases) {
-  RequantizationTester()
-    .testSpecialCases(xnn_requantize_precise__neon);
-}
+  TEST(PRECISE__NEON, divide_by_po2_with_rounding_away) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingAway(xnn_requantize_precise__neon);
+      }
+    }
+  }
 
-TEST(PRECISE__NEON, random_cases) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesPrecise(xnn_requantize_precise__neon);
-}
-
-
-/*
- * FP32-based ARM NEON implementation.
- */
-
-TEST(FP32__NEON, random_cases) {
-  RequantizationTester()
-    .iterations(1000)
-    .testRandomCasesApproximate(xnn_requantize_fp32__neon);
-}
-
-
-/*
- * Q31-based ARM NEON implementation.
- */
-
-TEST(Q31__NEON, exact_divide_by_po2) {
-  for (uint32_t s = 1; s < 32; s++) {
+  TEST(PRECISE__NEON, special_cases) {
     RequantizationTester()
-      .s(s)
-      .testExactDivideByPO2(xnn_requantize_q31__neon);
+      .testSpecialCases(xnn_requantize_precise__neon);
   }
-}
 
-TEST(Q31__NEON, exact_divide_by_po2_with_zero_point) {
-  for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+  TEST(PRECISE__NEON, random_cases) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesPrecise(xnn_requantize_precise__neon);
+  }
+
+
+  /*
+   * FP32-based ARM NEON implementation.
+   */
+
+  TEST(FP32__NEON, random_cases) {
+    RequantizationTester()
+      .iterations(1000)
+      .testRandomCasesApproximate(xnn_requantize_fp32__neon);
+  }
+
+
+  /*
+   * Q31-based ARM NEON implementation.
+   */
+
+  TEST(Q31__NEON, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
-        .zeroPoint(zeroPoint)
         .s(s)
         .testExactDivideByPO2(xnn_requantize_q31__neon);
     }
   }
-}
 
-TEST(Q31__NEON, divide_by_po2_with_rounding_up) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingUp(xnn_requantize_q31__neon);
+  TEST(Q31__NEON, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zeroPoint = 1; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testExactDivideByPO2(xnn_requantize_q31__neon);
+      }
     }
   }
-}
 
-/* No rounding down Test - it fails because of upward bias in multiplication */
-
-TEST(Q31__NEON, divide_by_po2_with_rounding_away) {
-  for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zeroPoint(zeroPoint)
-        .s(s)
-        .testDivideByPO2WithRoundingAway(xnn_requantize_q31__neon);
+  TEST(Q31__NEON, divide_by_po2_with_rounding_up) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingUp(xnn_requantize_q31__neon);
+      }
     }
   }
-}
 
-TEST(Q31__NEON, special_cases) {
-  RequantizationTester()
-    .testSpecialCases(xnn_requantize_q31__neon);
-}
+  /* No rounding down Test - it fails because of upward bias in multiplication */
 
-TEST(Q31__NEON, random_cases) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesApproximate(xnn_requantize_q31__neon);
-}
+  TEST(Q31__NEON, divide_by_po2_with_rounding_away) {
+    for (int32_t zeroPoint = 0; zeroPoint < 256; zeroPoint++) {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zeroPoint(zeroPoint)
+          .s(s)
+          .testDivideByPO2WithRoundingAway(xnn_requantize_q31__neon);
+      }
+    }
+  }
 
-TEST(Q31__NEON, random_match_gemmlowp) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesAgainstReference(xnn_requantize_q31__neon, xnn_requantize_gemmlowp__neon);
-}
+  TEST(Q31__NEON, special_cases) {
+    RequantizationTester()
+      .testSpecialCases(xnn_requantize_q31__neon);
+  }
+
+  TEST(Q31__NEON, random_cases) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesApproximate(xnn_requantize_q31__neon);
+  }
+
+  TEST(Q31__NEON, random_match_gemmlowp) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesAgainstReference(xnn_requantize_q31__neon, xnn_requantize_gemmlowp__neon);
+  }
 
 
-/*
- * ARM NEON implementation from gemmlowp.
- */
+  /*
+   * ARM NEON implementation from gemmlowp.
+   */
 
-TEST(GEMMLOWP__NEON, random_cases) {
-  RequantizationTester()
-    .iterations(100)
-    .testRandomCasesApproximate(xnn_requantize_gemmlowp__neon);
-}
-
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  TEST(GEMMLOWP__NEON, random_cases) {
+    RequantizationTester()
+      .iterations(100)
+      .testRandomCasesApproximate(xnn_requantize_gemmlowp__neon);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
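The requantization hunks above only re-nest the existing tests inside the new XNN_ARCH_* blocks; the cases themselves are unchanged. For readers of the divide_by_po2_* names and the skipped "rounding down" comments, the sketch below shows round-to-nearest division by a power of two with ties rounded away from zero, which is my reading of the behavior the precise kernels are tested against; treat it as an independent illustration under that assumption, not code from requantization-tester.h.

#include <cstdint>

// Reference sketch: divide x by 2**s, rounding to nearest, ties away from zero.
// Assumes 1 <= s < 32, matching the shift range exercised by the tests above.
static int32_t divide_by_po2_ties_away(int32_t x, uint32_t s) {
  const int64_t half = INT64_C(1) << (s - 1);  // 0.5 in units of the result
  return x >= 0
      ? (int32_t)  ((int64_t(x) + half) >> s)
      : (int32_t) -((-int64_t(x) + half) >> s);
}

// Examples: 5/4 = 1.25 -> 1; 6/4 = 1.5 -> 2 (tie, away from zero); -6/4 -> -2.
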
diff --git a/test/u8-clamp.cc b/test/u8-clamp.cc
index b20a76f..4fd6563 100644
--- a/test/u8-clamp.cc
+++ b/test/u8-clamp.cc
@@ -6,16 +6,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/clamp.h>
 
+#include <xnnpack/clamp.h>
 #include "clamp-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(U8_CLAMP__NEON, n_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     ClampMicrokernelTester()
@@ -88,9 +88,9 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(U8_CLAMP__SSE2, n_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     ClampMicrokernelTester()
@@ -163,7 +163,7 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 TEST(U8_CLAMP__SCALAR, n_eq_1) {
   ClampMicrokernelTester()
diff --git a/test/u8-lut32norm.cc b/test/u8-lut32norm.cc
index f3b1f80..959839d 100644
--- a/test/u8-lut32norm.cc
+++ b/test/u8-lut32norm.cc
@@ -9,7 +9,6 @@
 #include <gtest/gtest.h>
 
 #include <xnnpack/lut.h>
-
 #include "lut-norm-microkernel-tester.h"
 
 
diff --git a/test/u8-maxpool.cc b/test/u8-maxpool.cc
index e2bee52..b5832dc 100644
--- a/test/u8-maxpool.cc
+++ b/test/u8-maxpool.cc
@@ -6,16 +6,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/maxpool.h>
 
+#include <xnnpack/maxpool.h>
 #include "maxpool-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_unipass_fulltile) {
     TEST_REQUIRES_ARM_NEON;
     auto tester = MaxPoolMicrokernelTester()
@@ -1217,9 +1217,9 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_unipass_fulltile) {
     TEST_REQUIRES_X86_SSE2;
     auto tester = MaxPoolMicrokernelTester()
@@ -2421,7 +2421,7 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile) {
   auto tester = MaxPoolMicrokernelTester()
diff --git a/test/u8-rmax.cc b/test/u8-rmax.cc
index ad915ad..8f6b03f 100644
--- a/test/u8-rmax.cc
+++ b/test/u8-rmax.cc
@@ -6,16 +6,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/rmax.h>
 
+#include <xnnpack/rmax.h>
 #include "rmax-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(U8RMAX__NEON, n_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t n = 1; n < 16; n++) {
@@ -49,9 +49,9 @@
         .Test(xnn_u8_rmax_ukernel__neon);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(U8RMAX__SSE2, n_lt_16) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t n = 1; n < 16; n++) {
@@ -85,7 +85,7 @@
         .Test(xnn_u8_rmax_ukernel__sse2);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 TEST(U8RMAX__SCALAR, n_lt_2) {
   for (size_t n = 1; n < 2; n++) {
diff --git a/test/x32-packx.cc b/test/x32-packx.cc
index ce4e428..d2a5f65 100644
--- a/test/x32-packx.cc
+++ b/test/x32-packx.cc
@@ -8,16 +8,16 @@
 //   Generator: tools/generate-pack-test.py
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
-#include <xnnpack/packx.h>
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
 
+#include <xnnpack/packx.h>
 #include "pack-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(X32_PACKX_4X__NEON_ST4, k_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     PackMicrokernelTester()
@@ -121,10 +121,10 @@
         .Test(xnn_x32_packx_ukernel_4x__neon_st4);
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(X32_PACKX_4X__SSE, k_eq_4) {
     TEST_REQUIRES_X86_SSE;
     PackMicrokernelTester()
@@ -228,10 +228,10 @@
         .Test(xnn_x32_packx_ukernel_4x__sse);
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(X32_PACKX_4X__PSIMD, k_eq_4) {
     TEST_REQUIRES_PSIMD;
     PackMicrokernelTester()
@@ -335,7 +335,7 @@
         .Test(xnn_x32_packx_ukernel_4x__psimd);
     }
   }
-#endif  // !CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
 TEST(X32_PACKX_2X__SCALAR, k_eq_1) {
diff --git a/test/x32-pad.cc b/test/x32-pad.cc
index 615deaf..793a7f8 100644
--- a/test/x32-pad.cc
+++ b/test/x32-pad.cc
@@ -5,13 +5,14 @@
 
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/pad.h>
 
+#include <xnnpack/pad.h>
 #include "pad-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(X32_PAD_X2__NEON, fulltile_copy_n_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     PadMicrokernelTester()
@@ -201,10 +202,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(X32_PAD_X2__SSE2, fulltile_copy_n_eq_4) {
     TEST_REQUIRES_X86_SSE2;
     PadMicrokernelTester()
@@ -394,10 +395,10 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(X32_PAD_X2__PSIMD, fulltile_copy_n_eq_4) {
     TEST_REQUIRES_PSIMD;
     PadMicrokernelTester()
@@ -587,7 +588,7 @@
       }
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(X32_PAD_X2__SCALAR, fulltile_copy_n_eq_1) {
diff --git a/test/x32-unpool.cc b/test/x32-unpool.cc
index 04dd20f..dd9c17e 100644
--- a/test/x32-unpool.cc
+++ b/test/x32-unpool.cc
@@ -5,13 +5,14 @@
 
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/unpool.h>
 
+#include <xnnpack/unpool.h>
 #include "unpool-microkernel-tester.h"
 
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(X32_UNPOOL__PSIMD, c_eq_4) {
     TEST_REQUIRES_PSIMD;
     UnpoolMicrokernelTester()
@@ -83,7 +84,7 @@
         .Test(xnn_x32_unpool_ukernel__psimd);
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 
 TEST(X32_UNPOOL__SCALAR, c_eq_1) {
diff --git a/test/x32-zip.cc b/test/x32-zip.cc
index 7fe9dc7..729bcba 100644
--- a/test/x32-zip.cc
+++ b/test/x32-zip.cc
@@ -3,16 +3,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/zip.h>
 
+#include <xnnpack/zip.h>
 #include "zip-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(X32_ZIP_X2__NEON, n_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     ZipMicrokernelTester()
@@ -256,9 +256,9 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(X32_ZIP_X2__SSE2, n_eq_4) {
     TEST_REQUIRES_X86_SSE2;
     ZipMicrokernelTester()
@@ -502,9 +502,9 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
-#if !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
   TEST(X32_ZIP_X2__PSIMD, n_eq_4) {
     TEST_REQUIRES_PSIMD;
     ZipMicrokernelTester()
@@ -748,7 +748,7 @@
       }
     }
   }
-#endif  // !CPUINFO_ARCH_WASM && !CPUINFO_ARCH_ASMJS
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 TEST(X32_ZIP_X2__SCALAR, n_eq_1) {
   ZipMicrokernelTester()
diff --git a/test/x8-lut.cc b/test/x8-lut.cc
index aacbad2..4acfec8 100644
--- a/test/x8-lut.cc
+++ b/test/x8-lut.cc
@@ -9,7 +9,6 @@
 #include <gtest/gtest.h>
 
 #include <xnnpack/lut.h>
-
 #include "lut-microkernel-tester.h"
 
 
diff --git a/test/x8-zip.cc b/test/x8-zip.cc
index 8fe18b4..c92ee35 100644
--- a/test/x8-zip.cc
+++ b/test/x8-zip.cc
@@ -6,16 +6,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
-#include <xnnpack/zip.h>
 
+#include <xnnpack/zip.h>
 #include "zip-microkernel-tester.h"
 
 
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(X8_ZIP_X2__NEON, n_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     ZipMicrokernelTester()
@@ -237,9 +237,9 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
-#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(X8_ZIP_X2__SSE2, n_eq_16) {
     TEST_REQUIRES_X86_SSE2;
     ZipMicrokernelTester()
@@ -489,7 +489,7 @@
       }
     }
   }
-#endif  // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 TEST(X8_ZIP_X2__SCALAR, n_eq_1) {
   ZipMicrokernelTester()
diff --git a/tools/generate-dwconv-test.py b/tools/generate-dwconv-test.py
index 8ee4049..6ce276d 100755
--- a/tools/generate-dwconv-test.py
+++ b/tools/generate-dwconv-test.py
@@ -41,10 +41,10 @@
 
 
 ARCH_TO_MACRO_MAP = {
-  "aarch32": "CPUINFO_ARCH_ARM",
-  "aarch64": "CPUINFO_ARCH_ARM64",
-  "x86": "CPUINFO_ARCH_X86",
-  "x86-64": "CPUINFO_ARCH_X86_64",
+  "aarch32": "XNN_ARCH_ARM",
+  "aarch64": "XNN_ARCH_ARM64",
+  "x86": "XNN_ARCH_X86",
+  "x86-64": "XNN_ARCH_X86_64",
 }
 
 ISA_TO_ARCH_MAP = {
@@ -365,12 +365,12 @@
 //   Generator: {generator}
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
-#include <xnnpack/dwconv.h>
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
 
+#include <xnnpack/dwconv.h>
 #include "dwconv-microkernel-tester.h"
 """.format(specification=options.spec, generator=sys.argv[0])
 
@@ -391,7 +391,7 @@
         tests += indent(test_case) + "\n"
         tests += "#endif  // %s\n" % guard_macro
       elif isa == "psimd":
-        guard_macro = "!CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM"
+        guard_macro = "!XNN_ARCH_ASMJS && !XNN_ARCH_WASM"
         tests += "#if %s\n" % guard_macro
         tests += indent(test_case) + "\n"
         tests += "#endif  // %s\n" % guard_macro
diff --git a/tools/generate-gemm-test.py b/tools/generate-gemm-test.py
index 249f77f..8fcc0bd 100755
--- a/tools/generate-gemm-test.py
+++ b/tools/generate-gemm-test.py
@@ -40,10 +40,10 @@
 
 
 ARCH_TO_MACRO_MAP = {
-  "aarch32": "CPUINFO_ARCH_ARM",
-  "aarch64": "CPUINFO_ARCH_ARM64",
-  "x86": "CPUINFO_ARCH_X86",
-  "x86-64": "CPUINFO_ARCH_X86_64",
+  "aarch32": "XNN_ARCH_ARM",
+  "aarch64": "XNN_ARCH_ARM64",
+  "x86": "XNN_ARCH_X86",
+  "x86-64": "XNN_ARCH_X86_64",
 }
 
 ISA_TO_ARCH_MAP = {
@@ -875,14 +875,14 @@
 //   Generator: {generator}
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
 #include <xnnpack/gemm.h>
 #include <xnnpack/igemm.h>
 #include <xnnpack/ppmm.h>
-#include <xnnpack/isa-checks.h>
-
 #include "gemm-microkernel-tester.h"
 """.format(specification=options.spec, generator=sys.argv[0])
 
@@ -905,7 +905,7 @@
         tests += indent(test_case) + "\n"
         tests += "#endif  // %s\n" % guard_macro
       elif isa == "psimd":
-        guard_macro = "!CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM"
+        guard_macro = "!XNN_ARCH_ASMJS && !XNN_ARCH_WASM"
         tests += "#if %s\n" % guard_macro
         tests += indent(test_case) + "\n"
         tests += "#endif  // %s\n" % guard_macro
diff --git a/tools/generate-pack-test.py b/tools/generate-pack-test.py
index 38cca3e..7f03bca 100755
--- a/tools/generate-pack-test.py
+++ b/tools/generate-pack-test.py
@@ -40,10 +40,10 @@
 
 
 ARCH_TO_MACRO_MAP = {
-  "aarch32": "CPUINFO_ARCH_ARM",
-  "aarch64": "CPUINFO_ARCH_ARM64",
-  "x86": "CPUINFO_ARCH_X86",
-  "x86-64": "CPUINFO_ARCH_X86_64",
+  "aarch32": "XNN_ARCH_ARM",
+  "aarch64": "XNN_ARCH_ARM64",
+  "x86": "XNN_ARCH_X86",
+  "x86-64": "XNN_ARCH_X86_64",
 }
 
 ISA_TO_ARCH_MAP = {
@@ -252,12 +252,12 @@
 //   Generator: {generator}
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
-#include <xnnpack/packx.h>
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
 
+#include <xnnpack/packx.h>
 #include "pack-microkernel-tester.h"
 """.format(specification=options.spec, generator=sys.argv[0])
 
@@ -278,7 +278,7 @@
         tests += indent(test_case) + "\n"
         tests += "#endif  // %s\n" % guard_macro
       elif isa == "psimd":
-        guard_macro = "!CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM"
+        guard_macro = "!XNN_ARCH_ASMJS && !XNN_ARCH_WASM"
         tests += "#if %s\n" % guard_macro
         tests += indent(test_case) + "\n"
         tests += "#endif  // %s\n" % guard_macro
diff --git a/tools/generate-spmm-test.py b/tools/generate-spmm-test.py
index 62434a8..6835f87 100755
--- a/tools/generate-spmm-test.py
+++ b/tools/generate-spmm-test.py
@@ -40,10 +40,10 @@
 
 
 ARCH_TO_MACRO_MAP = {
-  "aarch32": "CPUINFO_ARCH_ARM",
-  "aarch64": "CPUINFO_ARCH_ARM64",
-  "x86": "CPUINFO_ARCH_X86",
-  "x86-64": "CPUINFO_ARCH_X86_64",
+  "aarch32": "XNN_ARCH_ARM",
+  "aarch64": "XNN_ARCH_ARM64",
+  "x86": "XNN_ARCH_X86",
+  "x86-64": "XNN_ARCH_X86_64",
 }
 
 ISA_TO_ARCH_MAP = {
@@ -468,12 +468,12 @@
 //   Generator: {generator}
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
-#include <xnnpack/spmm.h>
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
 
+#include <xnnpack/spmm.h>
 #include "spmm-microkernel-tester.h"
 """.format(specification=options.spec, generator=sys.argv[0])
 
@@ -495,7 +495,7 @@
         tests += indent(test_case) + "\n"
         tests += "#endif  // %s\n" % guard_macro
       elif isa == "psimd":
-        guard_macro = "!CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM"
+        guard_macro = "!XNN_ARCH_ASMJS && !XNN_ARCH_WASM"
         tests += "#if %s\n" % guard_macro
         tests += indent(test_case) + "\n"
         tests += "#endif  // %s\n" % guard_macro
diff --git a/tools/generate-vmulcaddc-test.py b/tools/generate-vmulcaddc-test.py
index 6fbca26..4d6a402 100755
--- a/tools/generate-vmulcaddc-test.py
+++ b/tools/generate-vmulcaddc-test.py
@@ -41,10 +41,10 @@
 
 
 ARCH_TO_MACRO_MAP = {
-  "aarch32": "CPUINFO_ARCH_ARM",
-  "aarch64": "CPUINFO_ARCH_ARM64",
-  "x86": "CPUINFO_ARCH_X86",
-  "x86-64": "CPUINFO_ARCH_X86_64",
+  "aarch32": "XNN_ARCH_ARM",
+  "aarch64": "XNN_ARCH_ARM64",
+  "x86": "XNN_ARCH_X86",
+  "x86-64": "XNN_ARCH_X86_64",
 }
 
 ISA_TO_ARCH_MAP = {
@@ -295,12 +295,12 @@
 //   Generator: {generator}
 
 
-#include <cpuinfo.h>
 #include <gtest/gtest.h>
 
-#include <xnnpack/vmulcaddc.h>
+#include <xnnpack/common.h>
 #include <xnnpack/isa-checks.h>
 
+#include <xnnpack/vmulcaddc.h>
 #include "vmulcaddc-microkernel-tester.h"
 """.format(specification=options.spec, generator=sys.argv[0])
 
@@ -321,7 +321,7 @@
         tests += indent(test_case) + "\n"
         tests += "#endif  // %s\n" % guard_macro
       elif isa == "psimd":
-        guard_macro = "!CPUINFO_ARCH_ASMJS && !CPUINFO_ARCH_WASM"
+        guard_macro = "!XNN_ARCH_ASMJS && !XNN_ARCH_WASM"
         tests += "#if %s\n" % guard_macro
         tests += indent(test_case) + "\n"
         tests += "#endif  // %s\n" % guard_macro