Use ISA-specific layouts in F32 [I]GEMM & DWCONV microkernels

PiperOrigin-RevId: 375547949
diff --git a/src/f32-gemm/avx-broadcast.c.in b/src/f32-gemm/avx-broadcast.c.in
index cc697c4..8e9699e 100644
--- a/src/f32-gemm/avx-broadcast.c.in
+++ b/src/f32-gemm/avx-broadcast.c.in
@@ -94,16 +94,16 @@
       k -= sizeof(float);
     } while (k != 0);
 
-    const __m256 vmax = _mm256_broadcast_ps((const __m128*) params->sse.max);
-    $for N in range(0, NR, 8):
-      $for M in range(MR):
-        vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
-
-    const __m256 vmin = _mm256_broadcast_ps((const __m128*) params->sse.min);
+    const __m256 vmin = _mm256_load_ps(params->avx.min);
     $for N in range(0, NR, 8):
       $for M in range(MR):
         vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
 
+    const __m256 vmax = _mm256_load_ps(params->avx.max);
+    $for N in range(0, NR, 8):
+      $for M in range(MR):
+        vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
+
     if XNN_LIKELY(nc >= ${NR}) {
       $for M in reversed(range(MR)):
         _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});