Use ISA-specific parameter layouts (e.g. params->avx instead of params->sse) in F32 [I]GEMM & DWCONV microkernels
PiperOrigin-RevId: 375547949
diff --git a/src/f32-gemm/avx-broadcast.c.in b/src/f32-gemm/avx-broadcast.c.in
index cc697c4..8e9699e 100644
--- a/src/f32-gemm/avx-broadcast.c.in
+++ b/src/f32-gemm/avx-broadcast.c.in
@@ -94,16 +94,16 @@
k -= sizeof(float);
} while (k != 0);
- const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
- $for N in range(0, NR, 8):
- $for M in range(MR):
- vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
-
- const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+ const __m256 vmin = _mm256_load_ps(params->avx.min);
$for N in range(0, NR, 8):
$for M in range(MR):
vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
+ const __m256 vmax = _mm256_load_ps(params->avx.max);
+ $for N in range(0, NR, 8):
+ $for M in range(MR):
+ vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
+
if XNN_LIKELY(nc >= ${NR}) {
$for M in reversed(range(MR)):
_mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});