QC8 GEMM/IGEMM assembly microkernels for ARMv7 NEON

- FP32 quantization for ARMv7 uses a float add with a magic bias instead of
a float-to-int convert; the subtract for the zero point includes the bias
and replaces the add for the output zero point.
- params has 2 additional values: magic_bias and magic_bias_less_output_zero_point.
- The params output_zero_point field is removed for ARMv7, so the total params structure is 10 bytes and is read using 2 loads.

PiperOrigin-RevId: 421896610
diff --git a/BUILD.bazel b/BUILD.bazel
index 4bcf5c5..c30e985 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -7017,6 +7017,10 @@
     "src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S",
     "src/f32-igemm/gen/4x8-minmax-aarch32-neon-ld64.S",
     "src/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a75.S",
+    "src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-ld64.S",
+    "src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "src/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-ld64.S",
+    "src/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S",
     "src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-ld64.S",
     "src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S",
     "src/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S",
@@ -11605,8 +11609,8 @@
     name = "qc8_gemm_minmax_fp32_test",
     timeout = "moderate",
     srcs = [
-        "test/qc8-gemm-minmax-fp32.cc",
         "test/qc8-gemm-minmax-fp32-2.cc",
+        "test/qc8-gemm-minmax-fp32.cc",
         "test/qc8-gemm-minmax-fp32-3.cc",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
     shard_count = 10,