Work around msan failures in AVX micro-kernels

Avoid using VMASKMOVPS to store values in the loop epilogue: MemorySanitizer
doesn't seem to recognize memory written via VMASKMOVPS as initialized.

PiperOrigin-RevId: 293517405
diff --git a/src/f32-dwconv/gen/up16x25-fma3.c b/src/f32-dwconv/gen/up16x25-fma3.c
index 1200afc..356512f 100644
--- a/src/f32-dwconv/gen/up16x25-fma3.c
+++ b/src/f32-dwconv/gen/up16x25-fma3.c
@@ -601,8 +601,22 @@
       __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
       vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
 
-      _mm256_maskstore_ps(output, vmask, vacc01234567);
-      output += c;
+      // _mm256_maskstore_ps(output, vmask, vacc01234567); output += c; could be used here, but triggers msan failures (probably an msan bug).
+      __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
+      if (c & 4) {
+        _mm_storeu_ps(output, vacc0123);
+        vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
+        output += 4;
+      }
+      if (c & 2) {
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        _mm_store_ss(output, vacc0123);
+        output += 1;
+      }
     }
 
     output = (float*) ((uintptr_t) output + output_increment);