Automated g4 rollback of changelist 274728310.
*** Reason for rollback ***
Causes faces to not be detected consistently (b/142777038).
*** Original change description ***
Neon intrinsics clamping - Replace 2 LD1R with 1 LD2R

PiperOrigin-RevId: 275943558
diff --git a/src/f32-gemm/neon-ld64.c.in b/src/f32-gemm/neon-ld64.c.in
index f275a35..5a89619 100644
--- a/src/f32-gemm/neon-ld64.c.in
+++ b/src/f32-gemm/neon-ld64.c.in
@@ -110,14 +110,15 @@
           $else:
             vacc${M}x${ABC[N:N+4]} = vmlaq_f32(vacc${M}x${ABC[N:N+4]}, va${M},   vb${ABC[N:N+4]});
     }
-    const float32x4x2_t voutput_clamp = vld2q_dup_f32(&params->scalar.max);
+    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     $for N in range(0, NR, 4):
       $for M in range(MR):
-        vacc${M}x${ABC[N:N+4]} = vminq_f32(vacc${M}x${ABC[N:N+4]}, voutput_clamp.val[0]);
+        vacc${M}x${ABC[N:N+4]} = vminq_f32(vacc${M}x${ABC[N:N+4]}, vmax);
 
+    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
     $for N in range(0, NR, 4):
       $for M in range(MR):
-        vacc${M}x${ABC[N:N+4]} = vmaxq_f32(vacc${M}x${ABC[N:N+4]}, voutput_clamp.val[1]);
+        vacc${M}x${ABC[N:N+4]} = vmaxq_f32(vacc${M}x${ABC[N:N+4]}, vmin);
 
     if XNN_LIKELY(nc >= ${NR}) {
       $for M in reversed(range(MR)):