Refactor and open-source Three-Pass Softmax micro-kernels

- RAddExpMinusMax micro-kernel (AVX2 and AVX512F)
- RAddStoreExpMinusMax micro-kernel (AVX2 and AVX512F)
- VScaleExpMinusMax micro-kernel (AVX2 and AVX512F)
- Unit tests for all micro-kernels (built and registered as CTest targets in CMakeLists.txt)

PiperOrigin-RevId: 275570264
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 480380a..0a631cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -486,13 +486,19 @@
   src/f32-rmax/avx.c)
 
 SET(XNNPACK_AVX2_MICROKERNEL_SRCS
+  src/f32-raddexpminusmax/avx2-p5-unroll64.c
+  src/f32-raddstoreexpminusmax/avx2-p5-unroll64.c
+  src/f32-vscaleexpminusmax/avx2-p5-unroll64.c
   src/math/exp-avx2-p5.c
   src/math/exp-avx2-perm-p3.c
   src/math/exp-avx2-perm-p4.c
   src/math/expminus-avx2-p5.c)
 
 SET(XNNPACK_AVX512F_MICROKERNEL_SRCS
+  src/f32-raddexpminusmax/avx512f-p5-scalef-unroll128.c
+  src/f32-raddstoreexpminusmax/avx512f-p5-scalef-unroll128.c
   src/f32-rmax/avx512f.c
+  src/f32-vscaleexpminusmax/avx512f-p5-scalef-unroll128.c
   src/math/exp-avx512f-p5-scalef.c
   src/math/exp-avx512f-p5.c
   src/math/exp-avx512f-perm-p3.c)
@@ -1030,6 +1036,24 @@
   TARGET_LINK_LIBRARIES(f32-prelu-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
   ADD_TEST(f32-prelu-test f32-prelu-test)
 
+  ADD_EXECUTABLE(f32-raddexpminusmax-test test/f32-raddexpminusmax.cc)
+  SET_TARGET_PROPERTIES(f32-raddexpminusmax-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(f32-raddexpminusmax-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(f32-raddexpminusmax-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+  ADD_TEST(f32-raddexpminusmax-test f32-raddexpminusmax-test)
+
+  ADD_EXECUTABLE(f32-raddstoreexpminusmax-test test/f32-raddstoreexpminusmax.cc)
+  SET_TARGET_PROPERTIES(f32-raddstoreexpminusmax-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(f32-raddstoreexpminusmax-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(f32-raddstoreexpminusmax-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+  ADD_TEST(f32-raddstoreexpminusmax-test f32-raddstoreexpminusmax-test)
+
   ADD_EXECUTABLE(f32-rmax-test test/f32-rmax.cc)
   SET_TARGET_PROPERTIES(f32-rmax-test PROPERTIES
     CXX_STANDARD 11
@@ -1075,6 +1099,15 @@
   TARGET_LINK_LIBRARIES(f32-vmulcaddc-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
   ADD_TEST(f32-vmulcaddc-test f32-vmulcaddc-test)
 
+  ADD_EXECUTABLE(f32-vscaleexpminusmax-test test/f32-vscaleexpminusmax.cc)
+  SET_TARGET_PROPERTIES(f32-vscaleexpminusmax-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(f32-vscaleexpminusmax-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(f32-vscaleexpminusmax-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+  ADD_TEST(f32-vscaleexpminusmax-test f32-vscaleexpminusmax-test)
+
   ADD_EXECUTABLE(f32-vsub-test test/f32-vsub.cc)
   SET_TARGET_PROPERTIES(f32-vsub-test PROPERTIES
     CXX_STANDARD 11