Neon IGEMM: do the remainder shifts in reversed MR order

- The order of the shifts (vextq_s8) should match the order of the stores, for best performance and consistency.

PiperOrigin-RevId: 395521320
diff --git a/src/qc8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c b/src/qc8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
index fd2ad4d..4cb084c 100644
--- a/src/qc8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
+++ b/src/qc8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
@@ -601,8 +601,8 @@
         vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
         vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
         vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
-        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
         vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
       }
       if (nc & 1) {
         vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);