Refactor requantization in scalar QS8/QC8/QU8 microkernels

- Rename the MAGIC variant to FMAGIC (floating-point min/max + magic bias) and
  the LRINT variant to LRINTF
- Avoid undefined behaviour in LRINT-variant microkernels
- Remove scalar microkernels with RNDNU requantization as they don't properly
  handle requantization scales greater than 1.0

PiperOrigin-RevId: 419542667
diff --git a/bench/qs8-dwconv-e2e.cc b/bench/qs8-dwconv-e2e.cc
index ea2cedb..89296fc 100644
--- a/bench/qs8-dwconv-e2e.cc
+++ b/bench/qs8-dwconv-e2e.cc
@@ -411,50 +411,50 @@
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
 
-static void qs8_dwconv_up1x9__scalar_lrint(benchmark::State& state, models::ExecutionPlanFactory model) {
+static void qs8_dwconv_up1x9__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
   DWConvEnd2EndBenchmark(state, model,
-    xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_lrint,
-    xnn_init_qs8_conv_minmax_fp32_scalar_lrint_params,
+    xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_lrintf,
+    xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
     1 /* channel tile */, 9 /* primary tile */);
 }
-static void qs8_dwconv_up2x9__scalar_lrint(benchmark::State& state, models::ExecutionPlanFactory model) {
+static void qs8_dwconv_up2x9__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
   DWConvEnd2EndBenchmark(state, model,
-    xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrint,
-    xnn_init_qs8_conv_minmax_fp32_scalar_lrint_params,
+    xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf,
+    xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
     2 /* channel tile */, 9 /* primary tile */);
 }
-static void qs8_dwconv_up4x9__scalar_lrint(benchmark::State& state, models::ExecutionPlanFactory model) {
+static void qs8_dwconv_up4x9__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
   DWConvEnd2EndBenchmark(state, model,
-    xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_lrint,
-    xnn_init_qs8_conv_minmax_fp32_scalar_lrint_params,
+    xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_lrintf,
+    xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
     4 /* channel tile */, 9 /* primary tile */);
 }
-static void qs8_dwconv_up1x9__scalar_magic(benchmark::State& state, models::ExecutionPlanFactory model) {
+static void qs8_dwconv_up1x9__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
   DWConvEnd2EndBenchmark(state, model,
-    xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_magic,
-    xnn_init_qs8_conv_minmax_fp32_scalar_magic_params,
+    xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic,
+    xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
     1 /* channel tile */, 9 /* primary tile */);
 }
-static void qs8_dwconv_up2x9__scalar_magic(benchmark::State& state, models::ExecutionPlanFactory model) {
+static void qs8_dwconv_up2x9__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
   DWConvEnd2EndBenchmark(state, model,
-    xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_magic,
-    xnn_init_qs8_conv_minmax_fp32_scalar_magic_params,
+    xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_fmagic,
+    xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
     2 /* channel tile */, 9 /* primary tile */);
 }
-static void qs8_dwconv_up4x9__scalar_magic(benchmark::State& state, models::ExecutionPlanFactory model) {
+static void qs8_dwconv_up4x9__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
   DWConvEnd2EndBenchmark(state, model,
-    xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_magic,
-    xnn_init_qs8_conv_minmax_fp32_scalar_magic_params,
+    xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_fmagic,
+    xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
     4 /* channel tile */, 9 /* primary tile */);
 }
 
-BENCHMARK_QS8_END2END(qs8_dwconv_up1x9__scalar_lrint);
-BENCHMARK_QS8_END2END(qs8_dwconv_up2x9__scalar_lrint);
-BENCHMARK_QS8_END2END(qs8_dwconv_up4x9__scalar_lrint);
+BENCHMARK_QS8_END2END(qs8_dwconv_up1x9__scalar_lrintf);
+BENCHMARK_QS8_END2END(qs8_dwconv_up2x9__scalar_lrintf);
+BENCHMARK_QS8_END2END(qs8_dwconv_up4x9__scalar_lrintf);
 
-BENCHMARK_QS8_END2END(qs8_dwconv_up1x9__scalar_magic);
-BENCHMARK_QS8_END2END(qs8_dwconv_up2x9__scalar_magic);
-BENCHMARK_QS8_END2END(qs8_dwconv_up4x9__scalar_magic);
+BENCHMARK_QS8_END2END(qs8_dwconv_up1x9__scalar_fmagic);
+BENCHMARK_QS8_END2END(qs8_dwconv_up2x9__scalar_fmagic);
+BENCHMARK_QS8_END2END(qs8_dwconv_up4x9__scalar_fmagic);
 
 
 #ifndef XNNPACK_BENCHMARK_NO_MAIN