Switch QS8/QU8 VMUL[C] NEON microkernels from FP32 to RNDNU requantization

Performance on Ulefone Note 8 (Cortex-A7):
- QS8 VMUL: 529 MB/s -> 716 MB/s (+35%)
- QS8 VMULC: 384 MB/s -> 535 MB/s (+39%)

PiperOrigin-RevId: 421490747
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8dd5d3c..424de58 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1407,8 +1407,8 @@
   src/qs8-vadd/gen/minmax-neon-ld64-x32.c
   src/qs8-vaddc/gen/minmax-neon-ld64-x16.c
   src/qs8-vaddc/gen/minmax-neon-ld64-x32.c
-  src/qs8-vmul/gen/minmax-fp32-neon-ld64-x16.c
-  src/qs8-vmulc/gen/minmax-fp32-neon-ld64-x16.c
+  src/qs8-vmul/gen/minmax-rndnu-neon-ld64-x16.c
+  src/qs8-vmulc/gen/minmax-rndnu-neon-ld64-x16.c
   src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
   src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c8.c
   src/qu8-dwconv/gen/up8x25-minmax-rndnu-neon-mul8.c
@@ -1428,8 +1428,8 @@
   src/qu8-vadd/gen/minmax-neon-ld64-x32.c
   src/qu8-vaddc/gen/minmax-neon-ld64-x16.c
   src/qu8-vaddc/gen/minmax-neon-ld64-x32.c
-  src/qu8-vmul/gen/minmax-fp32-neon-ld64-x16.c
-  src/qu8-vmulc/gen/minmax-fp32-neon-ld64-x16.c
+  src/qu8-vmul/gen/minmax-rndnu-neon-ld64-x16.c
+  src/qu8-vmulc/gen/minmax-rndnu-neon-ld64-x16.c
   src/s8-ibilinear/gen/neon-c8.c
   src/s8-ibilinear/gen/neon-c16.c
   src/s8-maxpool/9p8x-minmax-neon-c16.c
@@ -2347,9 +2347,15 @@
   src/qs8-vmul/gen/minmax-fp32-neon-ld64-x8.c
   src/qs8-vmul/gen/minmax-fp32-neon-ld64-x16.c
   src/qs8-vmul/gen/minmax-fp32-neon-ld128-x16.c
+  src/qs8-vmul/gen/minmax-rndnu-neon-ld64-x8.c
+  src/qs8-vmul/gen/minmax-rndnu-neon-ld64-x16.c
+  src/qs8-vmul/gen/minmax-rndnu-neon-ld128-x16.c
   src/qs8-vmulc/gen/minmax-fp32-neon-ld64-x8.c
   src/qs8-vmulc/gen/minmax-fp32-neon-ld64-x16.c
   src/qs8-vmulc/gen/minmax-fp32-neon-ld128-x16.c
+  src/qs8-vmulc/gen/minmax-rndnu-neon-ld64-x8.c
+  src/qs8-vmulc/gen/minmax-rndnu-neon-ld64-x16.c
+  src/qs8-vmulc/gen/minmax-rndnu-neon-ld128-x16.c
   src/qu8-avgpool/9p8x-minmax-neon-c8.c
   src/qu8-avgpool/9x-minmax-neon-c8.c
   src/qu8-dwconv/gen/up8x9-minmax-fp32-neon-mul16.c
@@ -2430,9 +2436,15 @@
   src/qu8-vmul/gen/minmax-fp32-neon-ld64-x8.c
   src/qu8-vmul/gen/minmax-fp32-neon-ld64-x16.c
   src/qu8-vmul/gen/minmax-fp32-neon-ld128-x16.c
+  src/qu8-vmul/gen/minmax-rndnu-neon-ld64-x8.c
+  src/qu8-vmul/gen/minmax-rndnu-neon-ld64-x16.c
+  src/qu8-vmul/gen/minmax-rndnu-neon-ld128-x16.c
   src/qu8-vmulc/gen/minmax-fp32-neon-ld64-x8.c
   src/qu8-vmulc/gen/minmax-fp32-neon-ld64-x16.c
   src/qu8-vmulc/gen/minmax-fp32-neon-ld128-x16.c
+  src/qu8-vmulc/gen/minmax-rndnu-neon-ld64-x8.c
+  src/qu8-vmulc/gen/minmax-rndnu-neon-ld64-x16.c
+  src/qu8-vmulc/gen/minmax-rndnu-neon-ld128-x16.c
   src/s8-ibilinear/gen/neon-c8.c
   src/s8-ibilinear/gen/neon-c16.c
   src/s8-maxpool/9p8x-minmax-neon-c16.c
@@ -2871,11 +2883,7 @@
   src/qc8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c
   src/qc8-igemm/gen/2x8c2s4-minmax-fp32-neonv8-mlal.c
   src/qc8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal.c
-  src/qc8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
-  src/qs8-vmul/gen/minmax-fp32-neonv8-ld64-x16.c
-  src/qs8-vmulc/gen/minmax-fp32-neonv8-ld64-x16.c
-  src/qu8-vmul/gen/minmax-fp32-neonv8-ld64-x16.c
-  src/qu8-vmulc/gen/minmax-fp32-neonv8-ld64-x16.c)
+  src/qc8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c)
 
 SET(ALL_NEONV8_MICROKERNEL_SRCS
   src/f32-qs8-vcvt/gen/vcvt-neonv8-x8.c
@@ -7899,6 +7907,15 @@
   TARGET_LINK_LIBRARIES(qu8-vmul-minmax-fp32-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
   ADD_TEST(qu8-vmul-minmax-fp32-test qu8-vmul-minmax-fp32-test)
 
+  ADD_EXECUTABLE(qu8-vmul-minmax-rndnu-test test/qu8-vmul-minmax-rndnu.cc $<TARGET_OBJECTS:all_microkernels>)
+  SET_TARGET_PROPERTIES(qu8-vmul-minmax-rndnu-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(qu8-vmul-minmax-rndnu-test PRIVATE include src test)
+  TARGET_LINK_LIBRARIES(qu8-vmul-minmax-rndnu-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
+  ADD_TEST(qu8-vmul-minmax-rndnu-test qu8-vmul-minmax-rndnu-test)
+
   ADD_EXECUTABLE(qu8-vmulc-minmax-fp32-test test/qu8-vmulc-minmax-fp32.cc $<TARGET_OBJECTS:all_microkernels>)
   SET_TARGET_PROPERTIES(qu8-vmulc-minmax-fp32-test PROPERTIES
     CXX_STANDARD 11
@@ -7908,6 +7925,15 @@
   TARGET_LINK_LIBRARIES(qu8-vmulc-minmax-fp32-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
   ADD_TEST(qu8-vmulc-minmax-fp32-test qu8-vmulc-minmax-fp32-test)
 
+  ADD_EXECUTABLE(qu8-vmulc-minmax-rndnu-test test/qu8-vmulc-minmax-rndnu.cc $<TARGET_OBJECTS:all_microkernels>)
+  SET_TARGET_PROPERTIES(qu8-vmulc-minmax-rndnu-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(qu8-vmulc-minmax-rndnu-test PRIVATE include src test)
+  TARGET_LINK_LIBRARIES(qu8-vmulc-minmax-rndnu-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
+  ADD_TEST(qu8-vmulc-minmax-rndnu-test qu8-vmulc-minmax-rndnu-test)
+
   ADD_EXECUTABLE(s8-ibilinear-test test/s8-ibilinear.cc $<TARGET_OBJECTS:all_microkernels>)
   SET_TARGET_PROPERTIES(s8-ibilinear-test PROPERTIES
     CXX_STANDARD 11