Work around generating v128.storeXX_lane for quantized WAsm SIMD microkernels

Slightly refactor the quantized microkernels to trick Clang into not generating
v128.store8_lane, v128.store16_lane, and v128.store32_lane instructions, which
require Chrome M90+.

PiperOrigin-RevId: 391713996
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
index 895f8b7..7672eff 100644
--- a/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
@@ -127,13 +127,14 @@
 
       nc -= 4;
     } else {
+      uint32_t vout0 = wasm_i32x4_extract_lane(vout, 0);
       if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) wasm_i16x8_extract_lane(vout, 0);
+        *((uint16_t*) c0) = (uint16_t) vout0;
+        vout0 >>= 16;
         c0 += 2;
-        vout = wasm_u32x4_shr(vout, 16);
       }
       if (nc & 1) {
-        *c0 = (int8_t) wasm_i8x16_extract_lane(vout, 0);
+        *c0 = (int8_t) vout0;
       }
 
       nc = 0;