Scalar RAddStoreExpMinusMax micro-kernels

- Building blocks for the SoftArgMax operator on WebAssembly (WAsm)
- P5 and LUT64+P2 implementations
- The scalar_p5_x4_acc2 version is the fastest on both ARM64 and x86-64

PiperOrigin-RevId: 290780293
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 944f4a6..a404ae5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -222,6 +222,18 @@
   src/f32-ppmm/gen/4x4-scalar.c
   src/f32-prelu/gen/scalar-2x1.c
   src/f32-prelu/gen/scalar-2x4.c
+  src/f32-raddstoreexpminusmax/gen/scalar-lut64-p2-x1.c
+  src/f32-raddstoreexpminusmax/gen/scalar-lut64-p2-x2.c
+  src/f32-raddstoreexpminusmax/gen/scalar-lut64-p2-x2-acc2.c
+  src/f32-raddstoreexpminusmax/gen/scalar-lut64-p2-x4.c
+  src/f32-raddstoreexpminusmax/gen/scalar-lut64-p2-x4-acc2.c
+  src/f32-raddstoreexpminusmax/gen/scalar-lut64-p2-x4-acc4.c
+  src/f32-raddstoreexpminusmax/gen/scalar-p5-x1.c
+  src/f32-raddstoreexpminusmax/gen/scalar-p5-x2.c
+  src/f32-raddstoreexpminusmax/gen/scalar-p5-x2-acc2.c
+  src/f32-raddstoreexpminusmax/gen/scalar-p5-x4.c
+  src/f32-raddstoreexpminusmax/gen/scalar-p5-x4-acc2.c
+  src/f32-raddstoreexpminusmax/gen/scalar-p5-x4-acc4.c
   src/f32-rmax/scalar.c
   src/f32-sigmoid/gen/scalar-lut2048-p1-div-x1.c
   src/f32-sigmoid/gen/scalar-lut2048-p1-div-x2.c