Unify layout of min/max parameters

PiperOrigin-RevId: 305575364
diff --git a/src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S b/src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S
index df24d0f..bd75792 100644
--- a/src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S
+++ b/src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S
@@ -286,8 +286,8 @@
           # Load vi6_next.lo
           LD1 {v16.2S}, [x13], 8
 
-          # vacc.lo = min(vacc.lo, vmax)
-          FMIN v0.2S, v0.2S, v30.2S
+          # vacc.lo = max(vacc.lo, vmin)
+          FMAX v0.2S, v0.2S, v30.2S
           # Load vk6_next.lo
           LD1 {v17.2S}, [x17], 8
 
@@ -296,8 +296,8 @@
           # Load vi6_next.hi
           LD1 {v18.2S}, [x13], 8
 
-          # vacc.hi = min(vacc.hi, vmax)
-          FMIN v1.2S, v1.2S, v30.2S
+          # vacc.hi = max(vacc.hi, vmin)
+          FMAX v1.2S, v1.2S, v30.2S
           # Load vk6_next.hi
           LD1 {v19.2S}, [x17], 8
 
@@ -306,8 +306,8 @@
           # Load vi7_next.lo
           LD1 {v20.2S}, [x14], 8
 
-          # vacc.lo = max(vacc.lo, vmin)
-          FMAX v0.2S, v0.2S, v31.2S
+          # vacc.lo = min(vacc.lo, vmax)
+          FMIN v0.2S, v0.2S, v31.2S
           # Load vk7_next.lo
           LD1 {v21.2S}, [x17], 8
 
@@ -316,8 +316,8 @@
           # Load vi7_next.hi
           LD1 {v22.2S}, [x14], 8
 
-          # vacc.hi = max(vacc.hi, vmin)
-          FMAX v1.2S, v1.2S, v31.2S
+          # vacc.hi = min(vacc.hi, vmax)
+          FMIN v1.2S, v1.2S, v31.2S
           # Load vk7_next.hi
           LD1 {v23.2S}, [x17], 8
 
@@ -454,8 +454,8 @@
             # Load vi6.lo
             LD1 {v4.2S}, [x13], 8
 
-            # vacc_prev.lo = min(vacc_prev.lo, vmax)
-            FMIN v2.2S, v2.2S, v30.2S
+            # vacc_prev.lo = max(vacc_prev.lo, vmin)
+            FMAX v2.2S, v2.2S, v30.2S
             # Load vk6.lo
             LD1 {v5.2S}, [x17], 8
 
@@ -464,8 +464,8 @@
             # Load vi6.hi
             LD1 {v6.2S}, [x13], 8
 
-            # vacc_prev.hi = min(vacc_prev.hi, vmax)
-            FMIN v3.2S, v3.2S, v30.2S
+            # vacc_prev.hi = max(vacc_prev.hi, vmin)
+            FMAX v3.2S, v3.2S, v30.2S
             # Load vk6.hi
             LD1 {v7.2S}, [x17], 8
 
@@ -474,8 +474,8 @@
             # Load vi7.lo
             LD1 {v28.2S}, [x14], 8
 
-            # vacc_prev.lo = max(vacc_prev.lo, vmin)
-            FMAX v2.2S, v2.2S, v31.2S
+            # vacc_prev.lo = min(vacc_prev.lo, vmax)
+            FMIN v2.2S, v2.2S, v31.2S
             # Load vk7.lo
             LD1 {v29.2S}, [x17], 8
 
@@ -484,8 +484,8 @@
             # Load vi7.hi
             LD1 {v10.2S}, [x14], 8
 
-            # vacc_prev.lo = max(vacc_prev.lo, vmin)
-            FMAX v3.2S, v3.2S, v31.2S
+            # vacc_prev.hi = min(vacc_prev.hi, vmax)
+            FMIN v3.2S, v3.2S, v31.2S
             # Load vk7.hi
             LD1 {v11.2S}, [x17], 8
 
@@ -615,8 +615,8 @@
             # Load vi6_next.lo
             LD1 {v16.2S}, [x13], 8
 
-            # vacc.lo = min(vacc.lo, vmax)
-            FMIN v0.2S, v0.2S, v30.2S
+            # vacc.lo = max(vacc.lo, vmin)
+            FMAX v0.2S, v0.2S, v30.2S
             # Load vk6_next.lo
             LD1 {v17.2S}, [x17], 8
 
@@ -625,8 +625,8 @@
             # Load vi6_next.hi
             LD1 {v18.2S}, [x13], 8
 
-            # vacc.hi = min(vacc.hi, vmax)
-            FMIN v1.2S, v1.2S, v30.2S
+            # vacc.hi = max(vacc.hi, vmin)
+            FMAX v1.2S, v1.2S, v30.2S
             # Load vk6_next.hi
             LD1 {v19.2S}, [x17], 8
 
@@ -635,8 +635,8 @@
             # Load vi7_next.lo
             LD1 {v20.2S}, [x14], 8
 
-            # vacc.lo = max(vacc.lo, vmin)
-            FMAX v0.2S, v0.2S, v31.2S
+            # vacc.lo = min(vacc.lo, vmax)
+            FMIN v0.2S, v0.2S, v31.2S
             # Load vk7_next.lo
             LD1 {v21.2S}, [x17], 8
 
@@ -645,8 +645,8 @@
             # Load vi7_next.hi
             LD1 {v22.2S}, [x14], 8
 
-            # vacc.hi = max(vacc.hi, vmin)
-            FMAX v1.2S, v1.2S, v31.2S
+            # vacc.hi = min(vacc.hi, vmax)
+            FMIN v1.2S, v1.2S, v31.2S
             # Load vk7_next.hi
             LD1 {v23.2S}, [x17], 8
 
@@ -712,17 +712,17 @@
           # vacc_prev.hi += vi8_prev.hi * vk8_prev.hi
           FMLA v3.2S, v26.2S, v27.2S
 
-          # vacc_prev.lo = min(vacc_prev.lo, vmax)
-          FMIN v2.2S, v2.2S, v30.2S
+          # vacc_prev.lo = max(vacc_prev.lo, vmin)
+          FMAX v2.2S, v2.2S, v30.2S
 
-          # vacc_prev.hi = min(vacc_prev.hi, vmax)
-          FMIN v3.2S, v3.2S, v30.2S
+          # vacc_prev.hi = max(vacc_prev.hi, vmin)
+          FMAX v3.2S, v3.2S, v30.2S
 
-          # vacc_prev.lo = max(vacc_prev.lo, vmin)
-          FMAX v2.2S, v2.2S, v31.2S
+          # vacc_prev.lo = min(vacc_prev.lo, vmax)
+          FMIN v2.2S, v2.2S, v31.2S
 
-          # vacc_prev.lo = max(vacc_prev.lo, vmin)
-          FMAX v3.2S, v3.2S, v31.2S
+          # vacc_prev.hi = min(vacc_prev.hi, vmax)
+          FMIN v3.2S, v3.2S, v31.2S
 
           # Store vacc_prev
           STP d2, d3, [x4], 16
@@ -756,8 +756,8 @@
         FMLA v0.4S, v28.4S, v17.4S
         FMLA v0.4S, v29.4S, v18.4S
 
-        FMIN v0.4S, v0.4S, v30.4S
-        FMAX v0.4S, v0.4S, v31.4S
+        FMAX v0.4S, v0.4S, v30.4S
+        FMIN v0.4S, v0.4S, v31.4S
 
         STR q0, [x4], 16
 
@@ -792,8 +792,8 @@
         FMLA v0.4S, v28.4S, v17.4S
         FMLA v0.4S, v29.4S, v18.4S
 
-        FMIN v0.4S, v0.4S, v30.4S
-        FMAX v0.4S, v0.4S, v31.4S
+        FMAX v0.4S, v0.4S, v30.4S
+        FMIN v0.4S, v0.4S, v31.4S
 
         TBZ x16, 1, 5f
 
diff --git a/src/f32-dwconv/up4x9-aarch64-neonfma.S b/src/f32-dwconv/up4x9-aarch64-neonfma.S
index 817eaec..56b90c5 100644
--- a/src/f32-dwconv/up4x9-aarch64-neonfma.S
+++ b/src/f32-dwconv/up4x9-aarch64-neonfma.S
@@ -77,8 +77,8 @@
         FMLA v0.4S, v8.4S, v17.4S
         FMLA v0.4S, v9.4S, v18.4S
 
-        FMIN v0.4S, v0.4S, v30.4S
-        FMAX v0.4S, v0.4S, v31.4S
+        FMAX v0.4S, v0.4S, v30.4S
+        FMIN v0.4S, v0.4S, v31.4S
 
         STR q0, [x4], 16
         SUBS x16, x16, 4
@@ -115,8 +115,8 @@
         FMLA v0.4S, v8.4S, v17.4S
         FMLA v0.4S, v9.4S, v18.4S
 
-        FMIN v0.4S, v0.4S, v30.4S
-        FMAX v0.4S, v0.4S, v31.4S
+        FMAX v0.4S, v0.4S, v30.4S
+        FMIN v0.4S, v0.4S, v31.4S
 
         TBZ x16, 1, 3f
 
diff --git a/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in b/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
index 55ba772..8f31635 100644
--- a/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
+++ b/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
@@ -275,13 +275,13 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v30.4s
+        FMAX v20.4s, v20.4s, v30.4s
         SUBS x1, x1, 12
-        FMIN v21.4s, v21.4s, v30.4s
-        FMIN v22.4s, v22.4s, v30.4s
-        FMAX v20.4s, v20.4s, v31.4s
-        FMAX v21.4s, v21.4s, v31.4s
-        FMAX v22.4s, v22.4s, v31.4s
+        FMAX v21.4s, v21.4s, v30.4s
+        FMAX v22.4s, v22.4s, v30.4s
+        FMIN v20.4s, v20.4s, v31.4s
+        FMIN v21.4s, v21.4s, v31.4s
+        FMIN v22.4s, v22.4s, v31.4s
 
         # Store full 1 x 12
         B.LO 7f
diff --git a/src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S.in b/src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S.in
index e88a09b..deee057 100644
--- a/src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S.in
+++ b/src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S.in
@@ -166,11 +166,11 @@
         FADD v17.4s, v17.4s, v19.4s
 
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
 
         # Store full 1 x 8
         B.LO 9f
diff --git a/src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in b/src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in
index 19491c7..6ee3699 100644
--- a/src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in
+++ b/src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in
@@ -159,10 +159,10 @@
         FADD v17.4s, v17.4s, v19.4s
 
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
-        FMIN v17.4s, v17.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
+        FMAX v16.4s, v16.4s, v4.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
 
         # Store full 1 x 8
         B.LO 9f
diff --git a/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in b/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
index d0bb021..bd806c8 100644
--- a/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
+++ b/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
@@ -429,31 +429,31 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v4.4s
+        FMAX v20.4s, v20.4s, v4.4s
         SUBS x1, x1, 12
-        FMIN v21.4s, v21.4s, v4.4s
-        FMIN v22.4s, v22.4s, v4.4s
-        FMIN v23.4s, v23.4s, v4.4s
-        FMIN v24.4s, v24.4s, v4.4s
-        FMIN v25.4s, v25.4s, v4.4s
-        FMIN v26.4s, v26.4s, v4.4s
-        FMIN v27.4s, v27.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v20.4s, v20.4s, v5.4s
-        FMAX v21.4s, v21.4s, v5.4s
-        FMAX v22.4s, v22.4s, v5.4s
-        FMAX v23.4s, v23.4s, v5.4s
-        FMAX v24.4s, v24.4s, v5.4s
-        FMAX v25.4s, v25.4s, v5.4s
-        FMAX v26.4s, v26.4s, v5.4s
-        FMAX v27.4s, v27.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v21.4s, v21.4s, v4.4s
+        FMAX v22.4s, v22.4s, v4.4s
+        FMAX v23.4s, v23.4s, v4.4s
+        FMAX v24.4s, v24.4s, v4.4s
+        FMAX v25.4s, v25.4s, v4.4s
+        FMAX v26.4s, v26.4s, v4.4s
+        FMAX v27.4s, v27.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v20.4s, v20.4s, v5.4s
+        FMIN v21.4s, v21.4s, v5.4s
+        FMIN v22.4s, v22.4s, v5.4s
+        FMIN v23.4s, v23.4s, v5.4s
+        FMIN v24.4s, v24.4s, v5.4s
+        FMIN v25.4s, v25.4s, v5.4s
+        FMIN v26.4s, v26.4s, v5.4s
+        FMIN v27.4s, v27.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 12
         B.LO 7f
diff --git a/src/f32-gemm/4x8-aarch32-neon-cortex-a53.S b/src/f32-gemm/4x8-aarch32-neon-cortex-a53.S
index b42672a..727be34 100644
--- a/src/f32-gemm/4x8-aarch32-neon-cortex-a53.S
+++ b/src/f32-gemm/4x8-aarch32-neon-cortex-a53.S
@@ -355,22 +355,22 @@
         VLD1.32     {d6[],d7[]}, [r5]
 
         // Clamp
-        VMIN.F32     q8,  q8, q2
-        VMIN.F32     q9,  q9, q2
-        VMIN.F32    q10, q10, q2
-        VMIN.F32    q11, q11, q2
-        VMIN.F32    q12, q12, q2
-        VMIN.F32    q13, q13, q2
-        VMIN.F32    q14, q14, q2
-        VMIN.F32    q15, q15, q2
-        VMAX.F32     q8,  q8, q3
-        VMAX.F32     q9,  q9, q3
-        VMAX.F32    q10, q10, q3
-        VMAX.F32    q11, q11, q3
-        VMAX.F32    q12, q12, q3
-        VMAX.F32    q13, q13, q3
-        VMAX.F32    q14, q14, q3
-        VMAX.F32    q15, q15, q3
+        VMAX.F32     q8,  q8, q2
+        VMAX.F32     q9,  q9, q2
+        VMAX.F32    q10, q10, q2
+        VMAX.F32    q11, q11, q2
+        VMAX.F32    q12, q12, q2
+        VMAX.F32    q13, q13, q2
+        VMAX.F32    q14, q14, q2
+        VMAX.F32    q15, q15, q2
+        VMIN.F32     q8,  q8, q3
+        VMIN.F32     q9,  q9, q3
+        VMIN.F32    q10, q10, q3
+        VMIN.F32    q11, q11, q3
+        VMIN.F32    q12, q12, q3
+        VMIN.F32    q13, q13, q3
+        VMIN.F32    q14, q14, q3
+        VMIN.F32    q15, q15, q3
 
         // Store full 4 x 8
         BLO         10f
diff --git a/src/f32-gemm/4x8-aarch32-neon-cortex-a55.S b/src/f32-gemm/4x8-aarch32-neon-cortex-a55.S
index ef22310..f812a1e 100644
--- a/src/f32-gemm/4x8-aarch32-neon-cortex-a55.S
+++ b/src/f32-gemm/4x8-aarch32-neon-cortex-a55.S
@@ -290,22 +290,22 @@
         VLD1.32     {d6[],d7[]}, [r5]
 
         // Clamp
-        VMIN.F32     q8,  q8, q2
-        VMIN.F32     q9,  q9, q2
-        VMIN.F32    q10, q10, q2
-        VMIN.F32    q11, q11, q2
-        VMIN.F32    q12, q12, q2
-        VMIN.F32    q13, q13, q2
-        VMIN.F32    q14, q14, q2
-        VMIN.F32    q15, q15, q2
-        VMAX.F32     q8,  q8, q3
-        VMAX.F32     q9,  q9, q3
-        VMAX.F32    q10, q10, q3
-        VMAX.F32    q11, q11, q3
-        VMAX.F32    q12, q12, q3
-        VMAX.F32    q13, q13, q3
-        VMAX.F32    q14, q14, q3
-        VMAX.F32    q15, q15, q3
+        VMAX.F32     q8,  q8, q2
+        VMAX.F32     q9,  q9, q2
+        VMAX.F32    q10, q10, q2
+        VMAX.F32    q11, q11, q2
+        VMAX.F32    q12, q12, q2
+        VMAX.F32    q13, q13, q2
+        VMAX.F32    q14, q14, q2
+        VMAX.F32    q15, q15, q2
+        VMIN.F32     q8,  q8, q3
+        VMIN.F32     q9,  q9, q3
+        VMIN.F32    q10, q10, q3
+        VMIN.F32    q11, q11, q3
+        VMIN.F32    q12, q12, q3
+        VMIN.F32    q13, q13, q3
+        VMIN.F32    q14, q14, q3
+        VMIN.F32    q15, q15, q3
 
         // Store full 4 x 8
         BLO         10f
diff --git a/src/f32-gemm/4x8-aarch32-neon-cortex-a75.S.in b/src/f32-gemm/4x8-aarch32-neon-cortex-a75.S.in
index dbe4ad1..05e9857 100644
--- a/src/f32-gemm/4x8-aarch32-neon-cortex-a75.S.in
+++ b/src/f32-gemm/4x8-aarch32-neon-cortex-a75.S.in
@@ -234,22 +234,22 @@
         VLD1.32     {d6[],d7[]}, [r5]
 
         // Clamp
-        VMIN.F32     q8,  q8, q2
-        VMIN.F32     q9,  q9, q2
-        VMIN.F32    q10, q10, q2
-        VMIN.F32    q11, q11, q2
-        VMIN.F32    q12, q12, q2
-        VMIN.F32    q13, q13, q2
-        VMIN.F32    q14, q14, q2
-        VMIN.F32    q15, q15, q2
-        VMAX.F32     q8,  q8, q3
-        VMAX.F32     q9,  q9, q3
-        VMAX.F32    q10, q10, q3
-        VMAX.F32    q11, q11, q3
-        VMAX.F32    q12, q12, q3
-        VMAX.F32    q13, q13, q3
-        VMAX.F32    q14, q14, q3
-        VMAX.F32    q15, q15, q3
+        VMAX.F32     q8,  q8, q2
+        VMAX.F32     q9,  q9, q2
+        VMAX.F32    q10, q10, q2
+        VMAX.F32    q11, q11, q2
+        VMAX.F32    q12, q12, q2
+        VMAX.F32    q13, q13, q2
+        VMAX.F32    q14, q14, q2
+        VMAX.F32    q15, q15, q2
+        VMIN.F32     q8,  q8, q3
+        VMIN.F32     q9,  q9, q3
+        VMIN.F32    q10, q10, q3
+        VMIN.F32    q11, q11, q3
+        VMIN.F32    q12, q12, q3
+        VMIN.F32    q13, q13, q3
+        VMIN.F32    q14, q14, q3
+        VMIN.F32    q15, q15, q3
 
         // Store full 4 x 8
         BLO         10f
diff --git a/src/f32-gemm/4x8-aarch32-neon-ld64.S b/src/f32-gemm/4x8-aarch32-neon-ld64.S
index 9614abd..a8e1ea2 100644
--- a/src/f32-gemm/4x8-aarch32-neon-ld64.S
+++ b/src/f32-gemm/4x8-aarch32-neon-ld64.S
@@ -122,23 +122,23 @@
 
 4:
         // Clamp
-        VMIN.F32     q8,  q8, q2
+        VMAX.F32     q8,  q8, q2
         SUBS        r1, r1, 8
-        VMIN.F32     q9,  q9, q2
-        VMIN.F32    q10, q10, q2
-        VMIN.F32    q11, q11, q2
-        VMIN.F32    q12, q12, q2
-        VMIN.F32    q13, q13, q2
-        VMIN.F32    q14, q14, q2
-        VMIN.F32    q15, q15, q2
-        VMAX.F32     q8,  q8, q3
-        VMAX.F32     q9,  q9, q3
-        VMAX.F32    q10, q10, q3
-        VMAX.F32    q11, q11, q3
-        VMAX.F32    q12, q12, q3
-        VMAX.F32    q13, q13, q3
-        VMAX.F32    q14, q14, q3
-        VMAX.F32    q15, q15, q3
+        VMAX.F32     q9,  q9, q2
+        VMAX.F32    q10, q10, q2
+        VMAX.F32    q11, q11, q2
+        VMAX.F32    q12, q12, q2
+        VMAX.F32    q13, q13, q2
+        VMAX.F32    q14, q14, q2
+        VMAX.F32    q15, q15, q2
+        VMIN.F32     q8,  q8, q3
+        VMIN.F32     q9,  q9, q3
+        VMIN.F32    q10, q10, q3
+        VMIN.F32    q11, q11, q3
+        VMIN.F32    q12, q12, q3
+        VMIN.F32    q13, q13, q3
+        VMIN.F32    q14, q14, q3
+        VMIN.F32    q15, q15, q3
 
         // Store full 4 x 8
         BLO         10f
diff --git a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in b/src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
index 2c5108b..bad1adb 100644
--- a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
+++ b/src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
@@ -362,23 +362,23 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
 
         # Store full 4 x 8
         B.LO 8f
diff --git a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a55.S.in b/src/f32-gemm/4x8-aarch64-neonfma-cortex-a55.S.in
index df8d503..e6c0e0a 100644
--- a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a55.S.in
+++ b/src/f32-gemm/4x8-aarch64-neonfma-cortex-a55.S.in
@@ -351,23 +351,23 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
 
         # Store full 4 x 8
         B.LO 8f
diff --git a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in b/src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
index 3587b38..810fe18 100644
--- a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
+++ b/src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
@@ -399,23 +399,23 @@
 
 6:
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         B.LO 7f
diff --git a/src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in b/src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in
index aab2dbd..571669f 100644
--- a/src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in
+++ b/src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in
@@ -140,23 +140,23 @@
 
 4:
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         B.LO 7f
diff --git a/src/f32-gemm/4x8-aarch64-neonfma-ld64.S.in b/src/f32-gemm/4x8-aarch64-neonfma-ld64.S.in
index 8147e90..3da8b3d 100644
--- a/src/f32-gemm/4x8-aarch64-neonfma-ld64.S.in
+++ b/src/f32-gemm/4x8-aarch64-neonfma-ld64.S.in
@@ -122,23 +122,23 @@
 
 4:
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         B.LO 7f
diff --git a/src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in b/src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
index a373267..bfff0e3 100644
--- a/src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
+++ b/src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
@@ -396,27 +396,27 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v30.4s
+        FMAX v20.4s, v20.4s, v30.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v30.4s
-        FMIN v22.4s, v22.4s, v30.4s
-        FMIN v23.4s, v23.4s, v30.4s
-        FMIN v24.4s, v24.4s, v30.4s
-        FMIN v25.4s, v25.4s, v30.4s
-        FMIN v26.4s, v26.4s, v30.4s
-        FMIN v27.4s, v27.4s, v30.4s
-        FMIN v28.4s, v28.4s, v30.4s
-        FMIN v29.4s, v29.4s, v30.4s
-        FMAX v20.4s, v20.4s, v31.4s
-        FMAX v21.4s, v21.4s, v31.4s
-        FMAX v22.4s, v22.4s, v31.4s
-        FMAX v23.4s, v23.4s, v31.4s
-        FMAX v24.4s, v24.4s, v31.4s
-        FMAX v25.4s, v25.4s, v31.4s
-        FMAX v26.4s, v26.4s, v31.4s
-        FMAX v27.4s, v27.4s, v31.4s
-        FMAX v28.4s, v28.4s, v31.4s
-        FMAX v29.4s, v29.4s, v31.4s
+        FMAX v21.4s, v21.4s, v30.4s
+        FMAX v22.4s, v22.4s, v30.4s
+        FMAX v23.4s, v23.4s, v30.4s
+        FMAX v24.4s, v24.4s, v30.4s
+        FMAX v25.4s, v25.4s, v30.4s
+        FMAX v26.4s, v26.4s, v30.4s
+        FMAX v27.4s, v27.4s, v30.4s
+        FMAX v28.4s, v28.4s, v30.4s
+        FMAX v29.4s, v29.4s, v30.4s
+        FMIN v20.4s, v20.4s, v31.4s
+        FMIN v21.4s, v21.4s, v31.4s
+        FMIN v22.4s, v22.4s, v31.4s
+        FMIN v23.4s, v23.4s, v31.4s
+        FMIN v24.4s, v24.4s, v31.4s
+        FMIN v25.4s, v25.4s, v31.4s
+        FMIN v26.4s, v26.4s, v31.4s
+        FMIN v27.4s, v27.4s, v31.4s
+        FMIN v28.4s, v28.4s, v31.4s
+        FMIN v29.4s, v29.4s, v31.4s
 
         # Store full 5 x 8
         B.LO 7f
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
index fa7056c..32bb07f 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
+++ b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
@@ -442,31 +442,31 @@
         B.NE 5f
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 8f
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a55.S.in b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a55.S.in
index 50d89ee..f65ceeb 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a55.S.in
+++ b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a55.S.in
@@ -439,31 +439,31 @@
         B.NE 5f
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 8f
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in
index 4546a7e..4540f89 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in
+++ b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in
@@ -429,30 +429,30 @@
         # Clamp
 3:
         SUBS x1, x1, 8
-        FMIN v20.4s, v20.4s, v6.4s
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v20.4s, v20.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         NOP
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
index 051a99c..414fd59 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
+++ b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
@@ -446,31 +446,31 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 7f
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-ios.S.in b/src/f32-gemm/6x8-aarch64-neonfma-ios.S.in
index 07f9156..7050881 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-ios.S.in
+++ b/src/f32-gemm/6x8-aarch64-neonfma-ios.S.in
@@ -446,33 +446,33 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
         # Load cn_stride
         LDR x0, [sp, 64]
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 7f
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in b/src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in
index 50b74f3..f1a19a3 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in
+++ b/src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in
@@ -224,31 +224,31 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 7f
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in b/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
index daae4a0..4d2c125 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
+++ b/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
@@ -194,31 +194,31 @@
         TBNZ x0, 2, 4f
 3:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 5f
diff --git a/src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S
index d096786..a057e25 100644
--- a/src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S
@@ -268,13 +268,13 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v30.4s
+        FMAX v20.4s, v20.4s, v30.4s
         SUBS x1, x1, 12
-        FMIN v21.4s, v21.4s, v30.4s
-        FMIN v22.4s, v22.4s, v30.4s
-        FMAX v20.4s, v20.4s, v31.4s
-        FMAX v21.4s, v21.4s, v31.4s
-        FMAX v22.4s, v22.4s, v31.4s
+        FMAX v21.4s, v21.4s, v30.4s
+        FMAX v22.4s, v22.4s, v30.4s
+        FMIN v20.4s, v20.4s, v31.4s
+        FMIN v21.4s, v21.4s, v31.4s
+        FMIN v22.4s, v22.4s, v31.4s
 
         # Store full 1 x 12
         B.LO 7f
diff --git a/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a53.S
index d88b63e..cce223c 100644
--- a/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a53.S
@@ -159,11 +159,11 @@
         FADD v17.4s, v17.4s, v19.4s
 
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
 
         # Store full 1 x 8
         B.LO 9f
diff --git a/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a57.S
index 9065ac2..728f32d 100644
--- a/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a57.S
@@ -142,10 +142,10 @@
         FADD v17.4s, v17.4s, v19.4s
 
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
-        FMIN v17.4s, v17.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
+        FMAX v16.4s, v16.4s, v4.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
 
         # Store full 1 x 8
         B.LO 9f
diff --git a/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a75.S
index 3e3df8a..606cf5a 100644
--- a/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a75.S
@@ -147,10 +147,10 @@
         FADD v17.4s, v17.4s, v19.4s
 
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
-        FMIN v17.4s, v17.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
+        FMAX v16.4s, v16.4s, v4.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
 
         # Store full 1 x 8
         B.LO 9f
diff --git a/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
index 897ad35..8fc40af 100644
--- a/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
@@ -399,31 +399,31 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v4.4s
+        FMAX v20.4s, v20.4s, v4.4s
         SUBS x1, x1, 12
-        FMIN v21.4s, v21.4s, v4.4s
-        FMIN v22.4s, v22.4s, v4.4s
-        FMIN v23.4s, v23.4s, v4.4s
-        FMIN v24.4s, v24.4s, v4.4s
-        FMIN v25.4s, v25.4s, v4.4s
-        FMIN v26.4s, v26.4s, v4.4s
-        FMIN v27.4s, v27.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v20.4s, v20.4s, v5.4s
-        FMAX v21.4s, v21.4s, v5.4s
-        FMAX v22.4s, v22.4s, v5.4s
-        FMAX v23.4s, v23.4s, v5.4s
-        FMAX v24.4s, v24.4s, v5.4s
-        FMAX v25.4s, v25.4s, v5.4s
-        FMAX v26.4s, v26.4s, v5.4s
-        FMAX v27.4s, v27.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v21.4s, v21.4s, v4.4s
+        FMAX v22.4s, v22.4s, v4.4s
+        FMAX v23.4s, v23.4s, v4.4s
+        FMAX v24.4s, v24.4s, v4.4s
+        FMAX v25.4s, v25.4s, v4.4s
+        FMAX v26.4s, v26.4s, v4.4s
+        FMAX v27.4s, v27.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v20.4s, v20.4s, v5.4s
+        FMIN v21.4s, v21.4s, v5.4s
+        FMIN v22.4s, v22.4s, v5.4s
+        FMIN v23.4s, v23.4s, v5.4s
+        FMIN v24.4s, v24.4s, v5.4s
+        FMIN v25.4s, v25.4s, v5.4s
+        FMIN v26.4s, v26.4s, v5.4s
+        FMIN v27.4s, v27.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 12
         B.LO 7f
diff --git a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S
index b0ef94f..6b7cb5e 100644
--- a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S
@@ -337,23 +337,23 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
 
         # Store full 4 x 8
         B.LO 8f
diff --git a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a55.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a55.S
index 534f6e4..6ea2a9d 100644
--- a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a55.S
+++ b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a55.S
@@ -326,23 +326,23 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
 
         # Store full 4 x 8
         B.LO 8f
diff --git a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a57.S
index 211adc4..4e9e5c8 100644
--- a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a57.S
@@ -378,23 +378,23 @@
 
 6:
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a75.S
index 1d150aa..4df1917 100644
--- a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a75.S
@@ -382,23 +382,23 @@
 
 6:
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S
index 87d7ddb..27371b0 100644
--- a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S
+++ b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S
@@ -127,23 +127,23 @@
 
 4:
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld64.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld64.S
index 6d146e1..183301c 100644
--- a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld64.S
+++ b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld64.S
@@ -109,23 +109,23 @@
 
 4:
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a57.S
index c69d119..40da71d 100644
--- a/src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a57.S
@@ -346,27 +346,27 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v30.4s
+        FMAX v20.4s, v20.4s, v30.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v30.4s
-        FMIN v22.4s, v22.4s, v30.4s
-        FMIN v23.4s, v23.4s, v30.4s
-        FMIN v24.4s, v24.4s, v30.4s
-        FMIN v25.4s, v25.4s, v30.4s
-        FMIN v26.4s, v26.4s, v30.4s
-        FMIN v27.4s, v27.4s, v30.4s
-        FMIN v28.4s, v28.4s, v30.4s
-        FMIN v29.4s, v29.4s, v30.4s
-        FMAX v20.4s, v20.4s, v31.4s
-        FMAX v21.4s, v21.4s, v31.4s
-        FMAX v22.4s, v22.4s, v31.4s
-        FMAX v23.4s, v23.4s, v31.4s
-        FMAX v24.4s, v24.4s, v31.4s
-        FMAX v25.4s, v25.4s, v31.4s
-        FMAX v26.4s, v26.4s, v31.4s
-        FMAX v27.4s, v27.4s, v31.4s
-        FMAX v28.4s, v28.4s, v31.4s
-        FMAX v29.4s, v29.4s, v31.4s
+        FMAX v21.4s, v21.4s, v30.4s
+        FMAX v22.4s, v22.4s, v30.4s
+        FMAX v23.4s, v23.4s, v30.4s
+        FMAX v24.4s, v24.4s, v30.4s
+        FMAX v25.4s, v25.4s, v30.4s
+        FMAX v26.4s, v26.4s, v30.4s
+        FMAX v27.4s, v27.4s, v30.4s
+        FMAX v28.4s, v28.4s, v30.4s
+        FMAX v29.4s, v29.4s, v30.4s
+        FMIN v20.4s, v20.4s, v31.4s
+        FMIN v21.4s, v21.4s, v31.4s
+        FMIN v22.4s, v22.4s, v31.4s
+        FMIN v23.4s, v23.4s, v31.4s
+        FMIN v24.4s, v24.4s, v31.4s
+        FMIN v25.4s, v25.4s, v31.4s
+        FMIN v26.4s, v26.4s, v31.4s
+        FMIN v27.4s, v27.4s, v31.4s
+        FMIN v28.4s, v28.4s, v31.4s
+        FMIN v29.4s, v29.4s, v31.4s
 
         # Store full 5 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a75.S
index 4a0dac4..7de201a 100644
--- a/src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a75.S
@@ -359,27 +359,27 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v30.4s
+        FMAX v20.4s, v20.4s, v30.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v30.4s
-        FMIN v22.4s, v22.4s, v30.4s
-        FMIN v23.4s, v23.4s, v30.4s
-        FMIN v24.4s, v24.4s, v30.4s
-        FMIN v25.4s, v25.4s, v30.4s
-        FMIN v26.4s, v26.4s, v30.4s
-        FMIN v27.4s, v27.4s, v30.4s
-        FMIN v28.4s, v28.4s, v30.4s
-        FMIN v29.4s, v29.4s, v30.4s
-        FMAX v20.4s, v20.4s, v31.4s
-        FMAX v21.4s, v21.4s, v31.4s
-        FMAX v22.4s, v22.4s, v31.4s
-        FMAX v23.4s, v23.4s, v31.4s
-        FMAX v24.4s, v24.4s, v31.4s
-        FMAX v25.4s, v25.4s, v31.4s
-        FMAX v26.4s, v26.4s, v31.4s
-        FMAX v27.4s, v27.4s, v31.4s
-        FMAX v28.4s, v28.4s, v31.4s
-        FMAX v29.4s, v29.4s, v31.4s
+        FMAX v21.4s, v21.4s, v30.4s
+        FMAX v22.4s, v22.4s, v30.4s
+        FMAX v23.4s, v23.4s, v30.4s
+        FMAX v24.4s, v24.4s, v30.4s
+        FMAX v25.4s, v25.4s, v30.4s
+        FMAX v26.4s, v26.4s, v30.4s
+        FMAX v27.4s, v27.4s, v30.4s
+        FMAX v28.4s, v28.4s, v30.4s
+        FMAX v29.4s, v29.4s, v30.4s
+        FMIN v20.4s, v20.4s, v31.4s
+        FMIN v21.4s, v21.4s, v31.4s
+        FMIN v22.4s, v22.4s, v31.4s
+        FMIN v23.4s, v23.4s, v31.4s
+        FMIN v24.4s, v24.4s, v31.4s
+        FMIN v25.4s, v25.4s, v31.4s
+        FMIN v26.4s, v26.4s, v31.4s
+        FMIN v27.4s, v27.4s, v31.4s
+        FMIN v28.4s, v28.4s, v31.4s
+        FMIN v29.4s, v29.4s, v31.4s
 
         # Store full 5 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S
index 0b5cb6e..41b2ebc 100644
--- a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S
@@ -409,31 +409,31 @@
         B.NE 5f
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 8f
diff --git a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a55.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a55.S
index fbfb1c6..a330e03 100644
--- a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a55.S
+++ b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a55.S
@@ -403,31 +403,31 @@
         B.NE 5f
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 8f
diff --git a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S
index 2178bc3..7757e48 100644
--- a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S
@@ -390,31 +390,31 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S
index 316a31f..59592bf 100644
--- a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S
+++ b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S
@@ -402,30 +402,30 @@
         # Clamp
 3:
         SUBS x1, x1, 8
-        FMIN v20.4s, v20.4s, v6.4s
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v20.4s, v20.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         NOP
diff --git a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S
index 870a737..b1d9ed1 100644
--- a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S
@@ -404,31 +404,31 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ios.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ios.S
index 9a7eab7..3780442 100644
--- a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ios.S
+++ b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ios.S
@@ -390,33 +390,33 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
         # Load cn_stride
         LDR x0, [sp, 64]
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
index 2fdf4f1..8bfd7b4 100644
--- a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
+++ b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
@@ -197,31 +197,31 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S
index eaa68f7..bb79b4e 100644
--- a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S
+++ b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S
@@ -167,31 +167,31 @@
         TBNZ x0, 2, 4f
 3:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 5f
diff --git a/src/f32-gemm/gen/1x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/1x12-aarch64-neonfma-cortex-a53.S
index e6b0171..7b6f40b 100644
--- a/src/f32-gemm/gen/1x12-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen/1x12-aarch64-neonfma-cortex-a53.S
@@ -265,13 +265,13 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v30.4s
+        FMAX v20.4s, v20.4s, v30.4s
         SUBS x1, x1, 12
-        FMIN v21.4s, v21.4s, v30.4s
-        FMIN v22.4s, v22.4s, v30.4s
-        FMAX v20.4s, v20.4s, v31.4s
-        FMAX v21.4s, v21.4s, v31.4s
-        FMAX v22.4s, v22.4s, v31.4s
+        FMAX v21.4s, v21.4s, v30.4s
+        FMAX v22.4s, v22.4s, v30.4s
+        FMIN v20.4s, v20.4s, v31.4s
+        FMIN v21.4s, v21.4s, v31.4s
+        FMIN v22.4s, v22.4s, v31.4s
 
         # Store full 1 x 12
         B.LO 7f
diff --git a/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a53.S
index 0e9c005..6882340 100644
--- a/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a53.S
@@ -156,11 +156,11 @@
         FADD v17.4s, v17.4s, v19.4s
 
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
 
         # Store full 1 x 8
         B.LO 9f
diff --git a/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a57.S
index 01b3e13..d842c84 100644
--- a/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a57.S
@@ -139,10 +139,10 @@
         FADD v17.4s, v17.4s, v19.4s
 
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
-        FMIN v17.4s, v17.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
+        FMAX v16.4s, v16.4s, v4.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
 
         # Store full 1 x 8
         B.LO 9f
diff --git a/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a75.S
index ad59e3c..4258e67 100644
--- a/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a75.S
@@ -144,10 +144,10 @@
         FADD v17.4s, v17.4s, v19.4s
 
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
-        FMIN v17.4s, v17.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
+        FMAX v16.4s, v16.4s, v4.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
 
         # Store full 1 x 8
         B.LO 9f
diff --git a/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
index 257ab4b..b9401df 100644
--- a/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
@@ -402,31 +402,31 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v4.4s
+        FMAX v20.4s, v20.4s, v4.4s
         SUBS x1, x1, 12
-        FMIN v21.4s, v21.4s, v4.4s
-        FMIN v22.4s, v22.4s, v4.4s
-        FMIN v23.4s, v23.4s, v4.4s
-        FMIN v24.4s, v24.4s, v4.4s
-        FMIN v25.4s, v25.4s, v4.4s
-        FMIN v26.4s, v26.4s, v4.4s
-        FMIN v27.4s, v27.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v20.4s, v20.4s, v5.4s
-        FMAX v21.4s, v21.4s, v5.4s
-        FMAX v22.4s, v22.4s, v5.4s
-        FMAX v23.4s, v23.4s, v5.4s
-        FMAX v24.4s, v24.4s, v5.4s
-        FMAX v25.4s, v25.4s, v5.4s
-        FMAX v26.4s, v26.4s, v5.4s
-        FMAX v27.4s, v27.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v21.4s, v21.4s, v4.4s
+        FMAX v22.4s, v22.4s, v4.4s
+        FMAX v23.4s, v23.4s, v4.4s
+        FMAX v24.4s, v24.4s, v4.4s
+        FMAX v25.4s, v25.4s, v4.4s
+        FMAX v26.4s, v26.4s, v4.4s
+        FMAX v27.4s, v27.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v20.4s, v20.4s, v5.4s
+        FMIN v21.4s, v21.4s, v5.4s
+        FMIN v22.4s, v22.4s, v5.4s
+        FMIN v23.4s, v23.4s, v5.4s
+        FMIN v24.4s, v24.4s, v5.4s
+        FMIN v25.4s, v25.4s, v5.4s
+        FMIN v26.4s, v26.4s, v5.4s
+        FMIN v27.4s, v27.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 12
         B.LO 7f
diff --git a/src/f32-gemm/gen/4x8-aarch32-neon-cortex-a75.S b/src/f32-gemm/gen/4x8-aarch32-neon-cortex-a75.S
index 09af875..0b28b36 100644
--- a/src/f32-gemm/gen/4x8-aarch32-neon-cortex-a75.S
+++ b/src/f32-gemm/gen/4x8-aarch32-neon-cortex-a75.S
@@ -211,22 +211,22 @@
         VLD1.32     {d6[],d7[]}, [r5]
 
         // Clamp
-        VMIN.F32     q8,  q8, q2
-        VMIN.F32     q9,  q9, q2
-        VMIN.F32    q10, q10, q2
-        VMIN.F32    q11, q11, q2
-        VMIN.F32    q12, q12, q2
-        VMIN.F32    q13, q13, q2
-        VMIN.F32    q14, q14, q2
-        VMIN.F32    q15, q15, q2
-        VMAX.F32     q8,  q8, q3
-        VMAX.F32     q9,  q9, q3
-        VMAX.F32    q10, q10, q3
-        VMAX.F32    q11, q11, q3
-        VMAX.F32    q12, q12, q3
-        VMAX.F32    q13, q13, q3
-        VMAX.F32    q14, q14, q3
-        VMAX.F32    q15, q15, q3
+        VMAX.F32     q8,  q8, q2
+        VMAX.F32     q9,  q9, q2
+        VMAX.F32    q10, q10, q2
+        VMAX.F32    q11, q11, q2
+        VMAX.F32    q12, q12, q2
+        VMAX.F32    q13, q13, q2
+        VMAX.F32    q14, q14, q2
+        VMAX.F32    q15, q15, q2
+        VMIN.F32     q8,  q8, q3
+        VMIN.F32     q9,  q9, q3
+        VMIN.F32    q10, q10, q3
+        VMIN.F32    q11, q11, q3
+        VMIN.F32    q12, q12, q3
+        VMIN.F32    q13, q13, q3
+        VMIN.F32    q14, q14, q3
+        VMIN.F32    q15, q15, q3
 
         // Store full 4 x 8
         BLO         10f
diff --git a/src/f32-gemm/gen/4x8-aarch32-neon-pld-cortex-a75.S b/src/f32-gemm/gen/4x8-aarch32-neon-pld-cortex-a75.S
index 10aac67..b2e9d0c 100644
--- a/src/f32-gemm/gen/4x8-aarch32-neon-pld-cortex-a75.S
+++ b/src/f32-gemm/gen/4x8-aarch32-neon-pld-cortex-a75.S
@@ -231,22 +231,22 @@
         VLD1.32     {d6[],d7[]}, [r5]
 
         // Clamp
-        VMIN.F32     q8,  q8, q2
-        VMIN.F32     q9,  q9, q2
-        VMIN.F32    q10, q10, q2
-        VMIN.F32    q11, q11, q2
-        VMIN.F32    q12, q12, q2
-        VMIN.F32    q13, q13, q2
-        VMIN.F32    q14, q14, q2
-        VMIN.F32    q15, q15, q2
-        VMAX.F32     q8,  q8, q3
-        VMAX.F32     q9,  q9, q3
-        VMAX.F32    q10, q10, q3
-        VMAX.F32    q11, q11, q3
-        VMAX.F32    q12, q12, q3
-        VMAX.F32    q13, q13, q3
-        VMAX.F32    q14, q14, q3
-        VMAX.F32    q15, q15, q3
+        VMAX.F32     q8,  q8, q2
+        VMAX.F32     q9,  q9, q2
+        VMAX.F32    q10, q10, q2
+        VMAX.F32    q11, q11, q2
+        VMAX.F32    q12, q12, q2
+        VMAX.F32    q13, q13, q2
+        VMAX.F32    q14, q14, q2
+        VMAX.F32    q15, q15, q2
+        VMIN.F32     q8,  q8, q3
+        VMIN.F32     q9,  q9, q3
+        VMIN.F32    q10, q10, q3
+        VMIN.F32    q11, q11, q3
+        VMIN.F32    q12, q12, q3
+        VMIN.F32    q13, q13, q3
+        VMIN.F32    q14, q14, q3
+        VMIN.F32    q15, q15, q3
 
         // Store full 4 x 8
         BLO         10f
diff --git a/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S
index 6b689e9..8706312 100644
--- a/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S
@@ -339,23 +339,23 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
 
         # Store full 4 x 8
         B.LO 8f
diff --git a/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a55.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a55.S
index b52fcc4..5baaa0e 100644
--- a/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a55.S
+++ b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a55.S
@@ -328,23 +328,23 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
 
         # Store full 4 x 8
         B.LO 8f
diff --git a/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a57.S
index b32d845..f14855c 100644
--- a/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a57.S
@@ -378,23 +378,23 @@
 
 6:
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a75.S
index 51ecd94..d29f3d3 100644
--- a/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a75.S
@@ -382,23 +382,23 @@
 
 6:
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
index aa7e7a7..4b2e255 100644
--- a/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
+++ b/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
@@ -127,23 +127,23 @@
 
 4:
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen/4x8-aarch64-neonfma-ld64.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-ld64.S
index 548450f..c81a76a 100644
--- a/src/f32-gemm/gen/4x8-aarch64-neonfma-ld64.S
+++ b/src/f32-gemm/gen/4x8-aarch64-neonfma-ld64.S
@@ -109,23 +109,23 @@
 
 4:
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
+        FMAX v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a57.S
index 2aa21cc..4e841f2 100644
--- a/src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a57.S
@@ -349,27 +349,27 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v30.4s
+        FMAX v20.4s, v20.4s, v30.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v30.4s
-        FMIN v22.4s, v22.4s, v30.4s
-        FMIN v23.4s, v23.4s, v30.4s
-        FMIN v24.4s, v24.4s, v30.4s
-        FMIN v25.4s, v25.4s, v30.4s
-        FMIN v26.4s, v26.4s, v30.4s
-        FMIN v27.4s, v27.4s, v30.4s
-        FMIN v28.4s, v28.4s, v30.4s
-        FMIN v29.4s, v29.4s, v30.4s
-        FMAX v20.4s, v20.4s, v31.4s
-        FMAX v21.4s, v21.4s, v31.4s
-        FMAX v22.4s, v22.4s, v31.4s
-        FMAX v23.4s, v23.4s, v31.4s
-        FMAX v24.4s, v24.4s, v31.4s
-        FMAX v25.4s, v25.4s, v31.4s
-        FMAX v26.4s, v26.4s, v31.4s
-        FMAX v27.4s, v27.4s, v31.4s
-        FMAX v28.4s, v28.4s, v31.4s
-        FMAX v29.4s, v29.4s, v31.4s
+        FMAX v21.4s, v21.4s, v30.4s
+        FMAX v22.4s, v22.4s, v30.4s
+        FMAX v23.4s, v23.4s, v30.4s
+        FMAX v24.4s, v24.4s, v30.4s
+        FMAX v25.4s, v25.4s, v30.4s
+        FMAX v26.4s, v26.4s, v30.4s
+        FMAX v27.4s, v27.4s, v30.4s
+        FMAX v28.4s, v28.4s, v30.4s
+        FMAX v29.4s, v29.4s, v30.4s
+        FMIN v20.4s, v20.4s, v31.4s
+        FMIN v21.4s, v21.4s, v31.4s
+        FMIN v22.4s, v22.4s, v31.4s
+        FMIN v23.4s, v23.4s, v31.4s
+        FMIN v24.4s, v24.4s, v31.4s
+        FMIN v25.4s, v25.4s, v31.4s
+        FMIN v26.4s, v26.4s, v31.4s
+        FMIN v27.4s, v27.4s, v31.4s
+        FMIN v28.4s, v28.4s, v31.4s
+        FMIN v29.4s, v29.4s, v31.4s
 
         # Store full 5 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a75.S
index 7b947fd..9db48c6 100644
--- a/src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a75.S
@@ -362,27 +362,27 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v30.4s
+        FMAX v20.4s, v20.4s, v30.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v30.4s
-        FMIN v22.4s, v22.4s, v30.4s
-        FMIN v23.4s, v23.4s, v30.4s
-        FMIN v24.4s, v24.4s, v30.4s
-        FMIN v25.4s, v25.4s, v30.4s
-        FMIN v26.4s, v26.4s, v30.4s
-        FMIN v27.4s, v27.4s, v30.4s
-        FMIN v28.4s, v28.4s, v30.4s
-        FMIN v29.4s, v29.4s, v30.4s
-        FMAX v20.4s, v20.4s, v31.4s
-        FMAX v21.4s, v21.4s, v31.4s
-        FMAX v22.4s, v22.4s, v31.4s
-        FMAX v23.4s, v23.4s, v31.4s
-        FMAX v24.4s, v24.4s, v31.4s
-        FMAX v25.4s, v25.4s, v31.4s
-        FMAX v26.4s, v26.4s, v31.4s
-        FMAX v27.4s, v27.4s, v31.4s
-        FMAX v28.4s, v28.4s, v31.4s
-        FMAX v29.4s, v29.4s, v31.4s
+        FMAX v21.4s, v21.4s, v30.4s
+        FMAX v22.4s, v22.4s, v30.4s
+        FMAX v23.4s, v23.4s, v30.4s
+        FMAX v24.4s, v24.4s, v30.4s
+        FMAX v25.4s, v25.4s, v30.4s
+        FMAX v26.4s, v26.4s, v30.4s
+        FMAX v27.4s, v27.4s, v30.4s
+        FMAX v28.4s, v28.4s, v30.4s
+        FMAX v29.4s, v29.4s, v30.4s
+        FMIN v20.4s, v20.4s, v31.4s
+        FMIN v21.4s, v21.4s, v31.4s
+        FMIN v22.4s, v22.4s, v31.4s
+        FMIN v23.4s, v23.4s, v31.4s
+        FMIN v24.4s, v24.4s, v31.4s
+        FMIN v25.4s, v25.4s, v31.4s
+        FMIN v26.4s, v26.4s, v31.4s
+        FMIN v27.4s, v27.4s, v31.4s
+        FMIN v28.4s, v28.4s, v31.4s
+        FMIN v29.4s, v29.4s, v31.4s
 
         # Store full 5 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S
index 1e80a60..db31f75 100644
--- a/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S
@@ -413,31 +413,31 @@
         B.NE 5f
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 8f
diff --git a/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a55.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a55.S
index f47f548..906343a 100644
--- a/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a55.S
+++ b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a55.S
@@ -407,31 +407,31 @@
         B.NE 5f
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 8f
diff --git a/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S
index b230b51..806513b 100644
--- a/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S
@@ -394,31 +394,31 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S
index 4dda218..b9a4da0 100644
--- a/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S
+++ b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S
@@ -406,30 +406,30 @@
         # Clamp
 3:
         SUBS x1, x1, 8
-        FMIN v20.4s, v20.4s, v6.4s
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v20.4s, v20.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         NOP
diff --git a/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S
index e65b6b4..68d7401 100644
--- a/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S
@@ -408,31 +408,31 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen/6x8-aarch64-neonfma-ios.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-ios.S
index aa7b950..b857d44 100644
--- a/src/f32-gemm/gen/6x8-aarch64-neonfma-ios.S
+++ b/src/f32-gemm/gen/6x8-aarch64-neonfma-ios.S
@@ -394,33 +394,33 @@
 
         # Clamp
 3:
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
         # Load cn_stride
         LDR x0, [sp, 64]
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S
index 7909ba2..48c67b4 100644
--- a/src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S
+++ b/src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S
@@ -201,31 +201,31 @@
 
 4:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 7f
diff --git a/src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S
index d946000..3844962 100644
--- a/src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S
+++ b/src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S
@@ -171,31 +171,31 @@
         TBNZ x0, 2, 4f
 3:
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 5f
diff --git a/src/f32-igemm/1x12-aarch64-neonfma-cortex-a53.S b/src/f32-igemm/1x12-aarch64-neonfma-cortex-a53.S
index 513f4c5..d09c073 100644
--- a/src/f32-igemm/1x12-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-igemm/1x12-aarch64-neonfma-cortex-a53.S
@@ -285,12 +285,12 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v30.4s
-        FMIN v21.4s, v21.4s, v30.4s
-        FMIN v22.4s, v22.4s, v30.4s
-        FMAX v20.4s, v20.4s, v31.4s
-        FMAX v21.4s, v21.4s, v31.4s
-        FMAX v22.4s, v22.4s, v31.4s
+        FMAX v20.4s, v20.4s, v30.4s
+        FMAX v21.4s, v21.4s, v30.4s
+        FMAX v22.4s, v22.4s, v30.4s
+        FMIN v20.4s, v20.4s, v31.4s
+        FMIN v21.4s, v21.4s, v31.4s
+        FMIN v22.4s, v22.4s, v31.4s
 
         # Store full 1 x 12
         SUBS x1, x1, 12
diff --git a/src/f32-igemm/1x8-aarch64-neonfma-cortex-a53.S b/src/f32-igemm/1x8-aarch64-neonfma-cortex-a53.S
index 08fb422..5c4245e 100644
--- a/src/f32-igemm/1x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-igemm/1x8-aarch64-neonfma-cortex-a53.S
@@ -163,10 +163,10 @@
         FADD v17.4s, v17.4s, v19.4s
 
         # Clamp
-        FMIN v16.4s, v16.4s, v30.4s
-        FMIN v17.4s, v17.4s, v30.4s
-        FMAX v16.4s, v16.4s, v31.4s
-        FMAX v17.4s, v17.4s, v31.4s
+        FMAX v16.4s, v16.4s, v30.4s
+        FMAX v17.4s, v17.4s, v30.4s
+        FMIN v16.4s, v16.4s, v31.4s
+        FMIN v17.4s, v17.4s, v31.4s
 
         # Store full 1 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/1x8-aarch64-neonfma-cortex-a75.S.in b/src/f32-igemm/1x8-aarch64-neonfma-cortex-a75.S.in
index a2c691c..9b5a8be 100644
--- a/src/f32-igemm/1x8-aarch64-neonfma-cortex-a75.S.in
+++ b/src/f32-igemm/1x8-aarch64-neonfma-cortex-a75.S.in
@@ -168,10 +168,10 @@
         FADD v17.4s, v17.4s, v19.4s
 
         # Clamp
-        FMIN v16.4s, v16.4s, v30.4s
-        FMIN v17.4s, v17.4s, v30.4s
-        FMAX v16.4s, v16.4s, v31.4s
-        FMAX v17.4s, v17.4s, v31.4s
+        FMAX v16.4s, v16.4s, v30.4s
+        FMAX v17.4s, v17.4s, v30.4s
+        FMIN v16.4s, v16.4s, v31.4s
+        FMIN v17.4s, v17.4s, v31.4s
 
         # Store full 1 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/4x12-aarch64-neonfma-cortex-a53.S b/src/f32-igemm/4x12-aarch64-neonfma-cortex-a53.S
index f64fe3e..daafff3 100644
--- a/src/f32-igemm/4x12-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-igemm/4x12-aarch64-neonfma-cortex-a53.S
@@ -425,31 +425,31 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v4.4s
+        FMAX v20.4s, v20.4s, v4.4s
         SUBS x1, x1, 12
-        FMIN v21.4s, v21.4s, v4.4s
-        FMIN v22.4s, v22.4s, v4.4s
-        FMIN v23.4s, v23.4s, v4.4s
-        FMIN v24.4s, v24.4s, v4.4s
-        FMIN v25.4s, v25.4s, v4.4s
-        FMIN v26.4s, v26.4s, v4.4s
-        FMIN v27.4s, v27.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v20.4s, v20.4s, v5.4s
-        FMAX v21.4s, v21.4s, v5.4s
-        FMAX v22.4s, v22.4s, v5.4s
-        FMAX v23.4s, v23.4s, v5.4s
-        FMAX v24.4s, v24.4s, v5.4s
-        FMAX v25.4s, v25.4s, v5.4s
-        FMAX v26.4s, v26.4s, v5.4s
-        FMAX v27.4s, v27.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v21.4s, v21.4s, v4.4s
+        FMAX v22.4s, v22.4s, v4.4s
+        FMAX v23.4s, v23.4s, v4.4s
+        FMAX v24.4s, v24.4s, v4.4s
+        FMAX v25.4s, v25.4s, v4.4s
+        FMAX v26.4s, v26.4s, v4.4s
+        FMAX v27.4s, v27.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v20.4s, v20.4s, v5.4s
+        FMIN v21.4s, v21.4s, v5.4s
+        FMIN v22.4s, v22.4s, v5.4s
+        FMIN v23.4s, v23.4s, v5.4s
+        FMIN v24.4s, v24.4s, v5.4s
+        FMIN v25.4s, v25.4s, v5.4s
+        FMIN v26.4s, v26.4s, v5.4s
+        FMIN v27.4s, v27.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 12
         B.LO 8f
diff --git a/src/f32-igemm/4x8-aarch32-neon-cortex-a53.S b/src/f32-igemm/4x8-aarch32-neon-cortex-a53.S
index 9af12dc..d41584a 100644
--- a/src/f32-igemm/4x8-aarch32-neon-cortex-a53.S
+++ b/src/f32-igemm/4x8-aarch32-neon-cortex-a53.S
@@ -384,22 +384,22 @@
         VLD1.32     {d6[],d7[]}, [r2]
 
         // Clamp
-        VMIN.F32     q8,  q8, q2
-        VMIN.F32     q9,  q9, q2
-        VMIN.F32    q10, q10, q2
-        VMIN.F32    q11, q11, q2
-        VMIN.F32    q12, q12, q2
-        VMIN.F32    q13, q13, q2
-        VMIN.F32    q14, q14, q2
-        VMIN.F32    q15, q15, q2
-        VMAX.F32     q8,  q8, q3
-        VMAX.F32     q9,  q9, q3
-        VMAX.F32    q10, q10, q3
-        VMAX.F32    q11, q11, q3
-        VMAX.F32    q12, q12, q3
-        VMAX.F32    q13, q13, q3
-        VMAX.F32    q14, q14, q3
-        VMAX.F32    q15, q15, q3
+        VMAX.F32     q8,  q8, q2
+        VMAX.F32     q9,  q9, q2
+        VMAX.F32    q10, q10, q2
+        VMAX.F32    q11, q11, q2
+        VMAX.F32    q12, q12, q2
+        VMAX.F32    q13, q13, q2
+        VMAX.F32    q14, q14, q2
+        VMAX.F32    q15, q15, q2
+        VMIN.F32     q8,  q8, q3
+        VMIN.F32     q9,  q9, q3
+        VMIN.F32    q10, q10, q3
+        VMIN.F32    q11, q11, q3
+        VMIN.F32    q12, q12, q3
+        VMIN.F32    q13, q13, q3
+        VMIN.F32    q14, q14, q3
+        VMIN.F32    q15, q15, q3
 
         // Store full 4 x 8
         BLO         10f
diff --git a/src/f32-igemm/4x8-aarch32-neon-cortex-a55.S b/src/f32-igemm/4x8-aarch32-neon-cortex-a55.S
index 7e9ad16..1f34307 100644
--- a/src/f32-igemm/4x8-aarch32-neon-cortex-a55.S
+++ b/src/f32-igemm/4x8-aarch32-neon-cortex-a55.S
@@ -318,22 +318,22 @@
         LDR          r0, [sp, 120]   // cn_stride
 
         // Clamp
-        VMIN.F32     q8,  q8, q2
-        VMIN.F32     q9,  q9, q2
-        VMIN.F32    q10, q10, q2
-        VMIN.F32    q11, q11, q2
-        VMIN.F32    q12, q12, q2
-        VMIN.F32    q13, q13, q2
-        VMIN.F32    q14, q14, q2
-        VMIN.F32    q15, q15, q2
-        VMAX.F32     q8,  q8, q3
-        VMAX.F32     q9,  q9, q3
-        VMAX.F32    q10, q10, q3
-        VMAX.F32    q11, q11, q3
-        VMAX.F32    q12, q12, q3
-        VMAX.F32    q13, q13, q3
-        VMAX.F32    q14, q14, q3
-        VMAX.F32    q15, q15, q3
+        VMAX.F32     q8,  q8, q2
+        VMAX.F32     q9,  q9, q2
+        VMAX.F32    q10, q10, q2
+        VMAX.F32    q11, q11, q2
+        VMAX.F32    q12, q12, q2
+        VMAX.F32    q13, q13, q2
+        VMAX.F32    q14, q14, q2
+        VMAX.F32    q15, q15, q2
+        VMIN.F32     q8,  q8, q3
+        VMIN.F32     q9,  q9, q3
+        VMIN.F32    q10, q10, q3
+        VMIN.F32    q11, q11, q3
+        VMIN.F32    q12, q12, q3
+        VMIN.F32    q13, q13, q3
+        VMIN.F32    q14, q14, q3
+        VMIN.F32    q15, q15, q3
 
         // Store full 4 x 8
         BLO         10f
diff --git a/src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in b/src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in
index 7b44663..8c6a075 100644
--- a/src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in
+++ b/src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in
@@ -262,22 +262,22 @@
         VLD1.32     {d6[],d7[]}, [r5]
 
         // Clamp
-        VMIN.F32     q8,  q8, q2
-        VMIN.F32     q9,  q9, q2
-        VMIN.F32    q10, q10, q2
-        VMIN.F32    q11, q11, q2
-        VMIN.F32    q12, q12, q2
-        VMIN.F32    q13, q13, q2
-        VMIN.F32    q14, q14, q2
-        VMIN.F32    q15, q15, q2
-        VMAX.F32     q8,  q8, q3
-        VMAX.F32     q9,  q9, q3
-        VMAX.F32    q10, q10, q3
-        VMAX.F32    q11, q11, q3
-        VMAX.F32    q12, q12, q3
-        VMAX.F32    q13, q13, q3
-        VMAX.F32    q14, q14, q3
-        VMAX.F32    q15, q15, q3
+        VMAX.F32     q8,  q8, q2
+        VMAX.F32     q9,  q9, q2
+        VMAX.F32    q10, q10, q2
+        VMAX.F32    q11, q11, q2
+        VMAX.F32    q12, q12, q2
+        VMAX.F32    q13, q13, q2
+        VMAX.F32    q14, q14, q2
+        VMAX.F32    q15, q15, q2
+        VMIN.F32     q8,  q8, q3
+        VMIN.F32     q9,  q9, q3
+        VMIN.F32    q10, q10, q3
+        VMIN.F32    q11, q11, q3
+        VMIN.F32    q12, q12, q3
+        VMIN.F32    q13, q13, q3
+        VMIN.F32    q14, q14, q3
+        VMIN.F32    q15, q15, q3
 
         // Store full 4 x 8
         BLO         10f
diff --git a/src/f32-igemm/4x8-aarch32-neon-ld64.S b/src/f32-igemm/4x8-aarch32-neon-ld64.S
index 12ec655..5f65a78 100644
--- a/src/f32-igemm/4x8-aarch32-neon-ld64.S
+++ b/src/f32-igemm/4x8-aarch32-neon-ld64.S
@@ -151,23 +151,23 @@
         LDR         r14, [sp, 72]    // p = ks
 
         // Clamp
-        VMIN.F32     q8,  q8, q2
+        VMAX.F32     q8,  q8, q2
         SUBS         r1, r1, 8
-        VMIN.F32     q9,  q9, q2
-        VMIN.F32    q10, q10, q2
-        VMIN.F32    q11, q11, q2
-        VMIN.F32    q12, q12, q2
-        VMIN.F32    q13, q13, q2
-        VMIN.F32    q14, q14, q2
-        VMIN.F32    q15, q15, q2
-        VMAX.F32     q8,  q8, q3
-        VMAX.F32     q9,  q9, q3
-        VMAX.F32    q10, q10, q3
-        VMAX.F32    q11, q11, q3
-        VMAX.F32    q12, q12, q3
-        VMAX.F32    q13, q13, q3
-        VMAX.F32    q14, q14, q3
-        VMAX.F32    q15, q15, q3
+        VMAX.F32     q9,  q9, q2
+        VMAX.F32    q10, q10, q2
+        VMAX.F32    q11, q11, q2
+        VMAX.F32    q12, q12, q2
+        VMAX.F32    q13, q13, q2
+        VMAX.F32    q14, q14, q2
+        VMAX.F32    q15, q15, q2
+        VMIN.F32     q8,  q8, q3
+        VMIN.F32     q9,  q9, q3
+        VMIN.F32    q10, q10, q3
+        VMIN.F32    q11, q11, q3
+        VMIN.F32    q12, q12, q3
+        VMIN.F32    q13, q13, q3
+        VMIN.F32    q14, q14, q3
+        VMIN.F32    q15, q15, q3
 
         // Store full 4 x 8
         BLO         10f
diff --git a/src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S b/src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S
index 07353e4..17ff86e 100644
--- a/src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S
@@ -359,22 +359,22 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
+        FMAX v20.4s, v20.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
 
         # Store full 4 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/4x8-aarch64-neonfma-cortex-a55.S b/src/f32-igemm/4x8-aarch64-neonfma-cortex-a55.S
index c40e931..c337997 100644
--- a/src/f32-igemm/4x8-aarch64-neonfma-cortex-a55.S
+++ b/src/f32-igemm/4x8-aarch64-neonfma-cortex-a55.S
@@ -350,22 +350,22 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
+        FMAX v20.4s, v20.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
 
         # Store full 4 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in b/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in
index e18f751..100163a 100644
--- a/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in
+++ b/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in
@@ -407,22 +407,22 @@
         B.HI 1b
 
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v16.4s, v16.4s, v4.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/5x8-aarch64-neonfma-cortex-a75.S.in b/src/f32-igemm/5x8-aarch64-neonfma-cortex-a75.S.in
index 9a0bd81..0979684 100644
--- a/src/f32-igemm/5x8-aarch64-neonfma-cortex-a75.S.in
+++ b/src/f32-igemm/5x8-aarch64-neonfma-cortex-a75.S.in
@@ -386,26 +386,26 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v30.4s
-        FMIN v21.4s, v21.4s, v30.4s
-        FMIN v22.4s, v22.4s, v30.4s
-        FMIN v23.4s, v23.4s, v30.4s
-        FMIN v24.4s, v24.4s, v30.4s
-        FMIN v25.4s, v25.4s, v30.4s
-        FMIN v26.4s, v26.4s, v30.4s
-        FMIN v27.4s, v27.4s, v30.4s
-        FMIN v28.4s, v28.4s, v30.4s
-        FMIN v29.4s, v29.4s, v30.4s
-        FMAX v20.4s, v20.4s, v31.4s
-        FMAX v21.4s, v21.4s, v31.4s
-        FMAX v22.4s, v22.4s, v31.4s
-        FMAX v23.4s, v23.4s, v31.4s
-        FMAX v24.4s, v24.4s, v31.4s
-        FMAX v25.4s, v25.4s, v31.4s
-        FMAX v26.4s, v26.4s, v31.4s
-        FMAX v27.4s, v27.4s, v31.4s
-        FMAX v28.4s, v28.4s, v31.4s
-        FMAX v29.4s, v29.4s, v31.4s
+        FMAX v20.4s, v20.4s, v30.4s
+        FMAX v21.4s, v21.4s, v30.4s
+        FMAX v22.4s, v22.4s, v30.4s
+        FMAX v23.4s, v23.4s, v30.4s
+        FMAX v24.4s, v24.4s, v30.4s
+        FMAX v25.4s, v25.4s, v30.4s
+        FMAX v26.4s, v26.4s, v30.4s
+        FMAX v27.4s, v27.4s, v30.4s
+        FMAX v28.4s, v28.4s, v30.4s
+        FMAX v29.4s, v29.4s, v30.4s
+        FMIN v20.4s, v20.4s, v31.4s
+        FMIN v21.4s, v21.4s, v31.4s
+        FMIN v22.4s, v22.4s, v31.4s
+        FMIN v23.4s, v23.4s, v31.4s
+        FMIN v24.4s, v24.4s, v31.4s
+        FMIN v25.4s, v25.4s, v31.4s
+        FMIN v26.4s, v26.4s, v31.4s
+        FMIN v27.4s, v27.4s, v31.4s
+        FMIN v28.4s, v28.4s, v31.4s
+        FMIN v29.4s, v29.4s, v31.4s
 
         # Store full 5 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S b/src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S
index 76ae5ad..9dd04ee 100644
--- a/src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S
@@ -425,31 +425,31 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 8f
diff --git a/src/f32-igemm/6x8-aarch64-neonfma-cortex-a55.S b/src/f32-igemm/6x8-aarch64-neonfma-cortex-a55.S
index bbc12e3..6e3fa33 100644
--- a/src/f32-igemm/6x8-aarch64-neonfma-cortex-a55.S
+++ b/src/f32-igemm/6x8-aarch64-neonfma-cortex-a55.S
@@ -419,31 +419,31 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
         SUBS x1, x1, 8
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         B.LO 8f
diff --git a/src/f32-igemm/6x8-aarch64-neonfma-cortex-a73.S b/src/f32-igemm/6x8-aarch64-neonfma-cortex-a73.S
index c79c1d5..197fd4f 100644
--- a/src/f32-igemm/6x8-aarch64-neonfma-cortex-a73.S
+++ b/src/f32-igemm/6x8-aarch64-neonfma-cortex-a73.S
@@ -427,30 +427,30 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v20.4s, v20.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in b/src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in
index 50666b2..48257d9 100644
--- a/src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in
+++ b/src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in
@@ -433,30 +433,30 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v20.4s, v20.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/6x8-aarch64-neonfma-ios.S.in b/src/f32-igemm/6x8-aarch64-neonfma-ios.S.in
index 2ab6764..7dbf53f 100644
--- a/src/f32-igemm/6x8-aarch64-neonfma-ios.S.in
+++ b/src/f32-igemm/6x8-aarch64-neonfma-ios.S.in
@@ -433,32 +433,32 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
         # Load cn_stride
         LDR x0, [sp, 96]
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/gen/1x8-aarch64-neonfma-cortex-a57.S b/src/f32-igemm/gen/1x8-aarch64-neonfma-cortex-a57.S
index 095656a..f1d935b 100644
--- a/src/f32-igemm/gen/1x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-igemm/gen/1x8-aarch64-neonfma-cortex-a57.S
@@ -154,10 +154,10 @@
         FADD v17.4s, v17.4s, v19.4s
 
         # Clamp
-        FMIN v16.4s, v16.4s, v30.4s
-        FMIN v17.4s, v17.4s, v30.4s
-        FMAX v16.4s, v16.4s, v31.4s
-        FMAX v17.4s, v17.4s, v31.4s
+        FMAX v16.4s, v16.4s, v30.4s
+        FMAX v17.4s, v17.4s, v30.4s
+        FMIN v16.4s, v16.4s, v31.4s
+        FMIN v17.4s, v17.4s, v31.4s
 
         # Store full 1 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/gen/1x8-aarch64-neonfma-cortex-a75.S b/src/f32-igemm/gen/1x8-aarch64-neonfma-cortex-a75.S
index 5e006e8..9f2238a 100644
--- a/src/f32-igemm/gen/1x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-igemm/gen/1x8-aarch64-neonfma-cortex-a75.S
@@ -164,10 +164,10 @@
         FADD v17.4s, v17.4s, v19.4s
 
         # Clamp
-        FMIN v16.4s, v16.4s, v30.4s
-        FMIN v17.4s, v17.4s, v30.4s
-        FMAX v16.4s, v16.4s, v31.4s
-        FMAX v17.4s, v17.4s, v31.4s
+        FMAX v16.4s, v16.4s, v30.4s
+        FMAX v17.4s, v17.4s, v30.4s
+        FMIN v16.4s, v16.4s, v31.4s
+        FMIN v17.4s, v17.4s, v31.4s
 
         # Store full 1 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/gen/4x8-aarch32-neon-cortex-a75.S b/src/f32-igemm/gen/4x8-aarch32-neon-cortex-a75.S
index 9579888..5abd65d 100644
--- a/src/f32-igemm/gen/4x8-aarch32-neon-cortex-a75.S
+++ b/src/f32-igemm/gen/4x8-aarch32-neon-cortex-a75.S
@@ -238,22 +238,22 @@
         VLD1.32     {d6[],d7[]}, [r5]
 
         // Clamp
-        VMIN.F32     q8,  q8, q2
-        VMIN.F32     q9,  q9, q2
-        VMIN.F32    q10, q10, q2
-        VMIN.F32    q11, q11, q2
-        VMIN.F32    q12, q12, q2
-        VMIN.F32    q13, q13, q2
-        VMIN.F32    q14, q14, q2
-        VMIN.F32    q15, q15, q2
-        VMAX.F32     q8,  q8, q3
-        VMAX.F32     q9,  q9, q3
-        VMAX.F32    q10, q10, q3
-        VMAX.F32    q11, q11, q3
-        VMAX.F32    q12, q12, q3
-        VMAX.F32    q13, q13, q3
-        VMAX.F32    q14, q14, q3
-        VMAX.F32    q15, q15, q3
+        VMAX.F32     q8,  q8, q2
+        VMAX.F32     q9,  q9, q2
+        VMAX.F32    q10, q10, q2
+        VMAX.F32    q11, q11, q2
+        VMAX.F32    q12, q12, q2
+        VMAX.F32    q13, q13, q2
+        VMAX.F32    q14, q14, q2
+        VMAX.F32    q15, q15, q2
+        VMIN.F32     q8,  q8, q3
+        VMIN.F32     q9,  q9, q3
+        VMIN.F32    q10, q10, q3
+        VMIN.F32    q11, q11, q3
+        VMIN.F32    q12, q12, q3
+        VMIN.F32    q13, q13, q3
+        VMIN.F32    q14, q14, q3
+        VMIN.F32    q15, q15, q3
 
         // Store full 4 x 8
         BLO         10f
diff --git a/src/f32-igemm/gen/4x8-aarch32-neon-pld-cortex-a75.S b/src/f32-igemm/gen/4x8-aarch32-neon-pld-cortex-a75.S
index 436296f..cfe5f96 100644
--- a/src/f32-igemm/gen/4x8-aarch32-neon-pld-cortex-a75.S
+++ b/src/f32-igemm/gen/4x8-aarch32-neon-pld-cortex-a75.S
@@ -258,22 +258,22 @@
         VLD1.32     {d6[],d7[]}, [r5]
 
         // Clamp
-        VMIN.F32     q8,  q8, q2
-        VMIN.F32     q9,  q9, q2
-        VMIN.F32    q10, q10, q2
-        VMIN.F32    q11, q11, q2
-        VMIN.F32    q12, q12, q2
-        VMIN.F32    q13, q13, q2
-        VMIN.F32    q14, q14, q2
-        VMIN.F32    q15, q15, q2
-        VMAX.F32     q8,  q8, q3
-        VMAX.F32     q9,  q9, q3
-        VMAX.F32    q10, q10, q3
-        VMAX.F32    q11, q11, q3
-        VMAX.F32    q12, q12, q3
-        VMAX.F32    q13, q13, q3
-        VMAX.F32    q14, q14, q3
-        VMAX.F32    q15, q15, q3
+        VMAX.F32     q8,  q8, q2
+        VMAX.F32     q9,  q9, q2
+        VMAX.F32    q10, q10, q2
+        VMAX.F32    q11, q11, q2
+        VMAX.F32    q12, q12, q2
+        VMAX.F32    q13, q13, q2
+        VMAX.F32    q14, q14, q2
+        VMAX.F32    q15, q15, q2
+        VMIN.F32     q8,  q8, q3
+        VMIN.F32     q9,  q9, q3
+        VMIN.F32    q10, q10, q3
+        VMIN.F32    q11, q11, q3
+        VMIN.F32    q12, q12, q3
+        VMIN.F32    q13, q13, q3
+        VMIN.F32    q14, q14, q3
+        VMIN.F32    q15, q15, q3
 
         // Store full 4 x 8
         BLO         10f
diff --git a/src/f32-igemm/gen/4x8-aarch64-neonfma-cortex-a57.S b/src/f32-igemm/gen/4x8-aarch64-neonfma-cortex-a57.S
index f76061e..aaaf82f 100644
--- a/src/f32-igemm/gen/4x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-igemm/gen/4x8-aarch64-neonfma-cortex-a57.S
@@ -403,22 +403,22 @@
         B.HI 1b
 
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v16.4s, v16.4s, v4.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/gen/4x8-aarch64-neonfma-cortex-a75.S b/src/f32-igemm/gen/4x8-aarch64-neonfma-cortex-a75.S
index 03f29a0..ccf8645 100644
--- a/src/f32-igemm/gen/4x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-igemm/gen/4x8-aarch64-neonfma-cortex-a75.S
@@ -407,22 +407,22 @@
         B.HI 1b
 
         # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
+        FMAX v16.4s, v16.4s, v4.4s
+        FMAX v17.4s, v17.4s, v4.4s
+        FMAX v18.4s, v18.4s, v4.4s
+        FMAX v19.4s, v19.4s, v4.4s
+        FMAX v28.4s, v28.4s, v4.4s
+        FMAX v29.4s, v29.4s, v4.4s
+        FMAX v30.4s, v30.4s, v4.4s
+        FMAX v31.4s, v31.4s, v4.4s
+        FMIN v16.4s, v16.4s, v5.4s
+        FMIN v17.4s, v17.4s, v5.4s
+        FMIN v18.4s, v18.4s, v5.4s
+        FMIN v19.4s, v19.4s, v5.4s
+        FMIN v28.4s, v28.4s, v5.4s
+        FMIN v29.4s, v29.4s, v5.4s
+        FMIN v30.4s, v30.4s, v5.4s
+        FMIN v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/gen/5x8-aarch64-neonfma-cortex-a57.S b/src/f32-igemm/gen/5x8-aarch64-neonfma-cortex-a57.S
index 4812a2c..250bec3 100644
--- a/src/f32-igemm/gen/5x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-igemm/gen/5x8-aarch64-neonfma-cortex-a57.S
@@ -374,26 +374,26 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v30.4s
-        FMIN v21.4s, v21.4s, v30.4s
-        FMIN v22.4s, v22.4s, v30.4s
-        FMIN v23.4s, v23.4s, v30.4s
-        FMIN v24.4s, v24.4s, v30.4s
-        FMIN v25.4s, v25.4s, v30.4s
-        FMIN v26.4s, v26.4s, v30.4s
-        FMIN v27.4s, v27.4s, v30.4s
-        FMIN v28.4s, v28.4s, v30.4s
-        FMIN v29.4s, v29.4s, v30.4s
-        FMAX v20.4s, v20.4s, v31.4s
-        FMAX v21.4s, v21.4s, v31.4s
-        FMAX v22.4s, v22.4s, v31.4s
-        FMAX v23.4s, v23.4s, v31.4s
-        FMAX v24.4s, v24.4s, v31.4s
-        FMAX v25.4s, v25.4s, v31.4s
-        FMAX v26.4s, v26.4s, v31.4s
-        FMAX v27.4s, v27.4s, v31.4s
-        FMAX v28.4s, v28.4s, v31.4s
-        FMAX v29.4s, v29.4s, v31.4s
+        FMAX v20.4s, v20.4s, v30.4s
+        FMAX v21.4s, v21.4s, v30.4s
+        FMAX v22.4s, v22.4s, v30.4s
+        FMAX v23.4s, v23.4s, v30.4s
+        FMAX v24.4s, v24.4s, v30.4s
+        FMAX v25.4s, v25.4s, v30.4s
+        FMAX v26.4s, v26.4s, v30.4s
+        FMAX v27.4s, v27.4s, v30.4s
+        FMAX v28.4s, v28.4s, v30.4s
+        FMAX v29.4s, v29.4s, v30.4s
+        FMIN v20.4s, v20.4s, v31.4s
+        FMIN v21.4s, v21.4s, v31.4s
+        FMIN v22.4s, v22.4s, v31.4s
+        FMIN v23.4s, v23.4s, v31.4s
+        FMIN v24.4s, v24.4s, v31.4s
+        FMIN v25.4s, v25.4s, v31.4s
+        FMIN v26.4s, v26.4s, v31.4s
+        FMIN v27.4s, v27.4s, v31.4s
+        FMIN v28.4s, v28.4s, v31.4s
+        FMIN v29.4s, v29.4s, v31.4s
 
         # Store full 5 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/gen/5x8-aarch64-neonfma-cortex-a75.S b/src/f32-igemm/gen/5x8-aarch64-neonfma-cortex-a75.S
index 54f1b39..4f64f28 100644
--- a/src/f32-igemm/gen/5x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-igemm/gen/5x8-aarch64-neonfma-cortex-a75.S
@@ -382,26 +382,26 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v30.4s
-        FMIN v21.4s, v21.4s, v30.4s
-        FMIN v22.4s, v22.4s, v30.4s
-        FMIN v23.4s, v23.4s, v30.4s
-        FMIN v24.4s, v24.4s, v30.4s
-        FMIN v25.4s, v25.4s, v30.4s
-        FMIN v26.4s, v26.4s, v30.4s
-        FMIN v27.4s, v27.4s, v30.4s
-        FMIN v28.4s, v28.4s, v30.4s
-        FMIN v29.4s, v29.4s, v30.4s
-        FMAX v20.4s, v20.4s, v31.4s
-        FMAX v21.4s, v21.4s, v31.4s
-        FMAX v22.4s, v22.4s, v31.4s
-        FMAX v23.4s, v23.4s, v31.4s
-        FMAX v24.4s, v24.4s, v31.4s
-        FMAX v25.4s, v25.4s, v31.4s
-        FMAX v26.4s, v26.4s, v31.4s
-        FMAX v27.4s, v27.4s, v31.4s
-        FMAX v28.4s, v28.4s, v31.4s
-        FMAX v29.4s, v29.4s, v31.4s
+        FMAX v20.4s, v20.4s, v30.4s
+        FMAX v21.4s, v21.4s, v30.4s
+        FMAX v22.4s, v22.4s, v30.4s
+        FMAX v23.4s, v23.4s, v30.4s
+        FMAX v24.4s, v24.4s, v30.4s
+        FMAX v25.4s, v25.4s, v30.4s
+        FMAX v26.4s, v26.4s, v30.4s
+        FMAX v27.4s, v27.4s, v30.4s
+        FMAX v28.4s, v28.4s, v30.4s
+        FMAX v29.4s, v29.4s, v30.4s
+        FMIN v20.4s, v20.4s, v31.4s
+        FMIN v21.4s, v21.4s, v31.4s
+        FMIN v22.4s, v22.4s, v31.4s
+        FMIN v23.4s, v23.4s, v31.4s
+        FMIN v24.4s, v24.4s, v31.4s
+        FMIN v25.4s, v25.4s, v31.4s
+        FMIN v26.4s, v26.4s, v31.4s
+        FMIN v27.4s, v27.4s, v31.4s
+        FMIN v28.4s, v28.4s, v31.4s
+        FMIN v29.4s, v29.4s, v31.4s
 
         # Store full 5 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a57.S b/src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a57.S
index e514086..f4dfd88 100644
--- a/src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a57.S
+++ b/src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a57.S
@@ -421,30 +421,30 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v20.4s, v20.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a75.S b/src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a75.S
index e4fa7f5..ec8403c 100644
--- a/src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a75.S
@@ -429,30 +429,30 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMAX v20.4s, v20.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         SUBS x1, x1, 8
diff --git a/src/f32-igemm/gen/6x8-aarch64-neonfma-ios.S b/src/f32-igemm/gen/6x8-aarch64-neonfma-ios.S
index 5b30b51..c280951 100644
--- a/src/f32-igemm/gen/6x8-aarch64-neonfma-ios.S
+++ b/src/f32-igemm/gen/6x8-aarch64-neonfma-ios.S
@@ -421,32 +421,32 @@
         B.HI 1b
 
         # Clamp
-        FMIN v20.4s, v20.4s, v6.4s
-        FMIN v21.4s, v21.4s, v6.4s
-        FMIN v22.4s, v22.4s, v6.4s
-        FMIN v23.4s, v23.4s, v6.4s
-        FMIN v24.4s, v24.4s, v6.4s
-        FMIN v25.4s, v25.4s, v6.4s
-        FMIN v26.4s, v26.4s, v6.4s
-        FMIN v27.4s, v27.4s, v6.4s
-        FMIN v28.4s, v28.4s, v6.4s
-        FMIN v29.4s, v29.4s, v6.4s
-        FMIN v30.4s, v30.4s, v6.4s
-        FMIN v31.4s, v31.4s, v6.4s
+        FMAX v20.4s, v20.4s, v6.4s
+        FMAX v21.4s, v21.4s, v6.4s
+        FMAX v22.4s, v22.4s, v6.4s
+        FMAX v23.4s, v23.4s, v6.4s
+        FMAX v24.4s, v24.4s, v6.4s
+        FMAX v25.4s, v25.4s, v6.4s
+        FMAX v26.4s, v26.4s, v6.4s
+        FMAX v27.4s, v27.4s, v6.4s
+        FMAX v28.4s, v28.4s, v6.4s
+        FMAX v29.4s, v29.4s, v6.4s
+        FMAX v30.4s, v30.4s, v6.4s
+        FMAX v31.4s, v31.4s, v6.4s
         # Load cn_stride
         LDR x0, [sp, 96]
-        FMAX v20.4s, v20.4s, v7.4s
-        FMAX v21.4s, v21.4s, v7.4s
-        FMAX v22.4s, v22.4s, v7.4s
-        FMAX v23.4s, v23.4s, v7.4s
-        FMAX v24.4s, v24.4s, v7.4s
-        FMAX v25.4s, v25.4s, v7.4s
-        FMAX v26.4s, v26.4s, v7.4s
-        FMAX v27.4s, v27.4s, v7.4s
-        FMAX v28.4s, v28.4s, v7.4s
-        FMAX v29.4s, v29.4s, v7.4s
-        FMAX v30.4s, v30.4s, v7.4s
-        FMAX v31.4s, v31.4s, v7.4s
+        FMIN v20.4s, v20.4s, v7.4s
+        FMIN v21.4s, v21.4s, v7.4s
+        FMIN v22.4s, v22.4s, v7.4s
+        FMIN v23.4s, v23.4s, v7.4s
+        FMIN v24.4s, v24.4s, v7.4s
+        FMIN v25.4s, v25.4s, v7.4s
+        FMIN v26.4s, v26.4s, v7.4s
+        FMIN v27.4s, v27.4s, v7.4s
+        FMIN v28.4s, v28.4s, v7.4s
+        FMIN v29.4s, v29.4s, v7.4s
+        FMIN v30.4s, v30.4s, v7.4s
+        FMIN v31.4s, v31.4s, v7.4s
 
         # Store full 6 x 8
         SUBS x1, x1, 8
diff --git a/src/q8-dwconv/up8x9-aarch32-neon.S b/src/q8-dwconv/up8x9-aarch32-neon.S
index 45a6980..e0ebcc1 100644
--- a/src/q8-dwconv/up8x9-aarch32-neon.S
+++ b/src/q8-dwconv/up8x9-aarch32-neon.S
@@ -62,12 +62,12 @@
 	# - q11 = vzero_shift_mask
 	VCEQ.S32 q11, q13, 0
 
-	# Load output max:
-	# - d20 = voutput_max
+	# Load output min:
+	# - d20 = voutput_min
 	VLD1.8 {d20[]}, [r12]!
 
-	# Load output min:
-	# - d21 = voutput_min
+	# Load output max:
+	# - d21 = voutput_max
 	VLD1.8 {d21[]}, [r12]
 
 	.p2align 3
@@ -207,8 +207,8 @@
 
 	VQADD.S16 q0, q12
 	VQMOVUN.S16 d0, q0
-	VMIN.U8 d0, d0, d20
-	VMAX.U8 d0, d0, d21
+	VMAX.U8 d0, d0, d20
+	VMIN.U8 d0, d0, d21
 
 	VST1.8 {d0}, [lr]!
 	SUBS r0, r0, 8
@@ -318,8 +318,8 @@
 
 	VQADD.S16 q0, q12
 	VQMOVUN.S16 d0, q0
-	VMIN.U8 d0, d0, d20
-	VMAX.U8 d0, d0, d21
+	VMAX.U8 d0, d0, d20
+	VMIN.U8 d0, d0, d21
 
 	TST r0, 4
 	BEQ 3f
diff --git a/src/xnnpack/params-init.h b/src/xnnpack/params-init.h
index 3d50e90..8a9b88f 100644
--- a/src/xnnpack/params-init.h
+++ b/src/xnnpack/params-init.h
@@ -114,8 +114,8 @@
       params.sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
     }
     for (uint32_t i = 0; i < 16; i++) {
-      params.sse2.output_max[i] = output_max;
       params.sse2.output_min[i] = output_min;
+      params.sse2.output_max[i] = output_max;
     }
   #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
     params.neon.input_zero_point = (int16_t) (uint16_t) input_zero_point;
@@ -123,8 +123,8 @@
     params.neon.multiplier = multiplier;
     params.neon.right_shift = -shift;
     params.neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
-    params.neon.output_max = output_max;
     params.neon.output_min = output_min;
+    params.neon.output_max = output_max;
   #else
     const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
     const uint32_t remainder_threshold = remainder_mask >> 1;
@@ -185,16 +185,16 @@
       params.sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
     }
     for (uint32_t i = 0; i < 16; i++) {
-      params.sse2.output_max[i] = output_max;
       params.sse2.output_min[i] = output_min;
+      params.sse2.output_max[i] = output_max;
     }
   #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
     params.neon.bias = bias;
     params.neon.multiplier = multiplier;
     params.neon.left_shift = (int64_t) -shift;
     params.neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
-    params.neon.output_max = output_max;
     params.neon.output_min = output_min;
+    params.neon.output_max = output_max;
   #else
     const uint32_t right_shift = (uint32_t) shift;
     const int64_t rounding = INT64_C(1) << (right_shift - 1);
@@ -432,8 +432,8 @@
   union xnn_f32_spchw_params params;
   #if XNN_ARCH_X86 || XNN_ARCH_X86_64
     for (uint32_t i = 0; i < 4; i++) {
-      params.sse.max[i] = output_max;
       params.sse.min[i] = output_min;
+      params.sse.max[i] = output_max;
     }
 
     const uint32_t w4 = (width - 1) & 3;
@@ -452,8 +452,8 @@
     params.sse.mask_odd[2] = -(uint32_t) (w8 >= 5);
     params.sse.mask_odd[3] = -(uint32_t) (w8 >= 7);
   #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
-    params.neon.max = output_max;
     params.neon.min = output_min;
+    params.neon.max = output_max;
 
     const uint32_t w4 = (width - 1) & 3;
     params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
@@ -471,8 +471,8 @@
     params.neon.mask_odd[2] = -(uint32_t) (w8 >= 5);
     params.neon.mask_odd[3] = -(uint32_t) (w8 >= 7);
   #else
-    params.scalar.max = output_max;
     params.scalar.min = output_min;
+    params.scalar.max = output_max;
   #endif
   return params;
 }
@@ -522,8 +522,8 @@
   float output_max)
 {
   union xnn_f32_spchw_params params;
-  params.scalar.max = output_max;
   params.scalar.min = output_min;
+  params.scalar.max = output_max;
   return params;
 }
 
@@ -536,12 +536,12 @@
   union xnn_u8_output_params params;
   #if XNN_ARCH_X86 || XNN_ARCH_X86_64
     for (uint32_t i = 0; i < 16; i++) {
-      params.sse2.max[i] = output_max;
       params.sse2.min[i] = output_min;
+      params.sse2.max[i] = output_max;
     }
   #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
-    params.neon.max = output_max;
     params.neon.min = output_min;
+    params.neon.max = output_max;
   #else
     params.scalar.min = (int32_t) (uint32_t) output_min;
     params.scalar.max = (int32_t) (uint32_t) output_max;
@@ -621,8 +621,8 @@
     }
     params.sse2.shift = shift;
     for (uint32_t i = 0; i < 16; i++) {
-      params.sse2.y_max[i] = output_max;
       params.sse2.y_min[i] = output_min;
+      params.sse2.y_max[i] = output_max;
     }
   #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
     params.neon.a_zero_point = a_zero_point;
@@ -631,8 +631,8 @@
     params.neon.a_multiplier = (int32_t) a_multiplier;
     params.neon.b_multiplier = (int32_t) b_multiplier;
     params.neon.right_shift = (int32_t) -shift;
-    params.neon.y_max = output_max;
     params.neon.y_min = output_min;
+    params.neon.y_max = output_max;
   #else
     const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
     const uint32_t remainder_threshold = remainder_mask >> 1;
@@ -644,8 +644,8 @@
     params.scalar.remainder_threshold = (int32_t) remainder_threshold;
     params.scalar.shift = shift;
     params.scalar.y_zero_point = (int32_t) (uint32_t) output_zero_point;
-    params.scalar.y_max = (int32_t) (uint32_t) output_max;
     params.scalar.y_min = (int32_t) (uint32_t) output_min;
+    params.scalar.y_max = (int32_t) (uint32_t) output_max;
   #endif
   return params;
 }
@@ -693,8 +693,8 @@
   params.scalar.remainder_threshold = (int32_t) remainder_threshold;
   params.scalar.shift = shift;
   params.scalar.y_zero_point = (int32_t) (uint32_t) output_zero_point;
-  params.scalar.y_max = (int32_t) (uint32_t) output_max;
   params.scalar.y_min = (int32_t) (uint32_t) output_min;
+  params.scalar.y_max = (int32_t) (uint32_t) output_max;
   return params;
 }
 
@@ -775,15 +775,15 @@
       params.sse2.zero_point[i] = (int16_t) (uint16_t) zero_point;
     }
     for (uint32_t i = 0; i < 16; i++) {
-      params.sse2.max[i] = max;
       params.sse2.min[i] = min;
+      params.sse2.max[i] = max;
     }
   #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
     params.neon.multiplier = multiplier;
     params.neon.right_shift = -shift;
     params.neon.zero_point = (int16_t) (uint16_t) zero_point;
-    params.neon.max = max;
     params.neon.min = min;
+    params.neon.max = max;
   #else
     const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
     const uint32_t remainder_threshold = remainder_mask >> 1;
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index dfd69cb..f3b5b7c 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -17,27 +17,27 @@
 
 struct xnn_f16_output_params {
   uint16_t scale;
-  uint16_t max;
   uint16_t min;
+  uint16_t max;
 };
 
 union xnn_f32_output_params {
   struct {
-    float max;
     float min;
+    float max;
   } scalar;
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
-    XNN_ALIGN(16) float max[4];
     XNN_ALIGN(16) float min[4];
+    XNN_ALIGN(16) float max[4];
   } sse;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
 
 union xnn_f32_spchw_params {
   struct {
-    float max;
     float min;
+    float max;
   } scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
@@ -50,8 +50,8 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
-    XNN_ALIGN(16) float max[4];
     XNN_ALIGN(16) float min[4];
+    XNN_ALIGN(16) float max[4];
     XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
     XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
     XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
@@ -61,19 +61,19 @@
 
 union xnn_u8_output_params {
   struct {
-    int32_t max;
     int32_t min;
+    int32_t max;
   } scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
-    uint8_t max;
     uint8_t min;
+    uint8_t max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
-    XNN_ALIGN(16) uint8_t max[16];
     XNN_ALIGN(16) uint8_t min[16];
+    XNN_ALIGN(16) uint8_t max[16];
   } sse2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
@@ -87,15 +87,15 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float multiplier[4];
-    XNN_ALIGN(16) float output_max[4];
     XNN_ALIGN(16) float output_min[4];
+    XNN_ALIGN(16) float output_max[4];
   } sse2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     XNN_ALIGN(16) float multiplier;
-    XNN_ALIGN(16) float output_max;
     XNN_ALIGN(16) float output_min;
+    XNN_ALIGN(16) float output_max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 };
@@ -109,16 +109,16 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float multiplier[4];
-    XNN_ALIGN(16) float output_max[4];
     XNN_ALIGN(16) float output_min[4];
+    XNN_ALIGN(16) float output_max[4];
     XNN_ALIGN(16) uint32_t mask[4];
   } sse;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     XNN_ALIGN(16) float multiplier;
-    XNN_ALIGN(16) float output_max;
     XNN_ALIGN(16) float output_min;
+    XNN_ALIGN(16) float output_max;
     XNN_ALIGN(16) uint32_t mask[4];
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64 */
@@ -158,8 +158,8 @@
     int32_t multiplier;
     int32_t right_shift;
     int16_t output_zero_point;
-    uint8_t output_max;
     uint8_t output_min;
+    uint8_t output_max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -172,8 +172,8 @@
     XNN_ALIGN(16) int32_t remainder_threshold[4];
     XNN_ALIGN(16) uint64_t shift[2];
     XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) uint8_t output_max[16];
     XNN_ALIGN(16) uint8_t output_min[16];
+    XNN_ALIGN(16) uint8_t output_max[16];
   } sse2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
@@ -187,8 +187,8 @@
     int32_t remainder_mask;
     int32_t remainder_threshold;
     int32_t y_zero_point;
-    int32_t y_max;
     int32_t y_min;
+    int32_t y_max;
   } scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
@@ -198,8 +198,8 @@
     int32_t a_multiplier;
     int32_t b_multiplier;
     int32_t right_shift;
-    uint8_t y_max;
     uint8_t y_min;
+    uint8_t y_max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -212,8 +212,8 @@
     XNN_ALIGN(16) int32_t remainder_mask[4];
     XNN_ALIGN(16) int32_t remainder_threshold[4];
     XNN_ALIGN(16) int16_t y_zero_point[8];
-    XNN_ALIGN(16) uint8_t y_max[16];
     XNN_ALIGN(16) uint8_t y_min[16];
+    XNN_ALIGN(16) uint8_t y_max[16];
     uint32_t shift;
     uint32_t a_multiplier;
     uint32_t b_multiplier;
@@ -237,8 +237,8 @@
     int32_t multiplier;
     int64_t left_shift;
     int16_t output_zero_point;
-    uint8_t output_max;
     uint8_t output_min;
+    uint8_t output_max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -248,8 +248,8 @@
     XNN_ALIGN(16) uint64_t rounding[2];
     XNN_ALIGN(16) uint64_t right_shift[2];
     XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) uint8_t output_max[16];
     XNN_ALIGN(16) uint8_t output_min[16];
+    XNN_ALIGN(16) uint8_t output_max[16];
   } sse2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
@@ -264,22 +264,22 @@
   } scalar;
   struct {
     float scale;
-    float max;
     float min;
+    float max;
     float magic;
     int32_t magic_less_zero_point;
   } neon;
   struct {
     float scale;
     int16_t zero_point;
-    uint8_t max;
     uint8_t min;
+    uint8_t max;
   } neonv8;
   struct {
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) int16_t zero_point[8];
-    XNN_ALIGN(16) uint8_t max[16];
     XNN_ALIGN(16) uint8_t min[16];
+    XNN_ALIGN(16) uint8_t max[16];
   } sse2;
   struct {
     XNN_ALIGN(16) float scale[4];
@@ -304,16 +304,16 @@
     int32_t multiplier;
     int32_t right_shift;
     int16_t zero_point;
-    uint8_t max;
     uint8_t min;
+    uint8_t max;
   } neon;
   struct {
     XNN_ALIGN(16) uint32_t multiplier[4];
     XNN_ALIGN(16) uint64_t rounding[2];
     XNN_ALIGN(16) uint32_t shift[4];
     XNN_ALIGN(16) int16_t zero_point[8];
-    XNN_ALIGN(16) uint8_t max[16];
     XNN_ALIGN(16) uint8_t min[16];
+    XNN_ALIGN(16) uint8_t max[16];
   } sse2;
 };
 
@@ -332,8 +332,8 @@
     int32_t multiplier;
     int32_t right_shift;
     int16_t zero_point;
-    uint8_t max;
     uint8_t min;
+    uint8_t max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -344,8 +344,8 @@
     XNN_ALIGN(16) int32_t remainder_threshold[4];
     XNN_ALIGN(16) uint64_t shift[2];
     XNN_ALIGN(16) int16_t zero_point[8];
-    XNN_ALIGN(16) uint8_t max[16];
     XNN_ALIGN(16) uint8_t min[16];
+    XNN_ALIGN(16) uint8_t max[16];
   } sse2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };