SoftArgMax operator

SoftArgMax operator implementation using Three-Pass algorithm with reloading of
computed exponentials.

PiperOrigin-RevId: 291778753
diff --git a/src/init.c b/src/init.c
index 2f9c728..2f7b608 100644
--- a/src/init.c
+++ b/src/init.c
@@ -37,6 +37,7 @@
 #include <xnnpack/params.h>
 #include <xnnpack/pavgpool.h>
 #include <xnnpack/prelu.h>
+#include <xnnpack/raddstoreexpminusmax.h>
 #include <xnnpack/rmax.h>
 #include <xnnpack/spmm.h>
 #include <xnnpack/unpool.h>
@@ -248,6 +249,8 @@
       .row_tile = 2,
       .channel_tile = 8,
     };
+    xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x8;
+    xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
     xnn_params.f32.vadd = (struct vbinary_parameters) {
       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__neon_x8,
       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__neon_x8,
@@ -552,6 +555,8 @@
       .row_tile = 2,
       .channel_tile = 8,
     };
+    xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x16;
+    xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
     xnn_params.f32.vadd = (struct vbinary_parameters) {
       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__neon_x8,
       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__neon_x8,
@@ -911,6 +916,8 @@
       .row_tile = 2,
       .channel_tile = 8,
     };
+    xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc2;
+    xnn_params.f32.rmax = xnn_f32_rmax_ukernel__sse;
     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
       xnn_params.f32.vadd = (struct vbinary_parameters) {
         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__avx512f_x32,
@@ -1218,6 +1225,8 @@
       .row_tile = 2,
       .channel_tile = 8,
     };
+    xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2;
+    xnn_params.f32.rmax = xnn_f32_rmax_ukernel__psimd;
     xnn_params.f32.vadd = (struct vbinary_parameters) {
       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__psimd_x8,
       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__psimd_x8,
@@ -1424,6 +1433,8 @@
       .row_tile = 4,
       .channel_tile = 4,
     };
+    xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2;
+    xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
     xnn_params.f32.vadd = (struct vbinary_parameters) {
       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasm_x4,
       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasm_x4,