Work around msan failures in AVX micro-kernels

Avoid using VMASKMOVPS to store values in loop epilog: MemorySanitizer doesn't
seem to recognize initialization via VMASKMOVPS.

PiperOrigin-RevId: 293517405
diff --git a/src/f32-vbinary/gen/vadd-avx-x8.c b/src/f32-vbinary/gen/vadd-avx-x8.c
index 14cced7..38208f5 100644
--- a/src/f32-vbinary/gen/vadd-avx-x8.c
+++ b/src/f32-vbinary/gen/vadd-avx-x8.c
@@ -70,6 +70,21 @@
     __m256 vy = _mm256_add_ps(va, vb);
     vy = _mm256_max_ps(vy, vy_min);
     vy = _mm256_min_ps(vy, vy_max);
-    _mm256_maskstore_ps(y, vmask, vy);
+
+    // _mm256_maskstore_ps(y, vmask, vy) could be used here, but triggers msan failures (probably an msan bug).
+    __m128 vy_lo = _mm256_castps256_ps128(vy);
+    if (n & (4 * sizeof(float))) {
+      _mm_storeu_ps(y, vy_lo);
+      vy_lo = _mm256_extractf128_ps(vy, 1);
+      y += 4;
+    }
+    if (n & (2 * sizeof(float))) {
+      _mm_storel_pi((__m64*) y, vy_lo);
+      vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      _mm_store_ss(y, vy_lo);
+    }
   }
 }