Add vector SQRT (f32-vsqrt) microkernels

PiperOrigin-RevId: 319054903
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e330f4..c62c71b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -414,6 +414,9 @@
   src/f32-vrnd/gen/vrndd-scalar-libm-x1.c
   src/f32-vrnd/gen/vrndd-scalar-libm-x2.c
   src/f32-vrnd/gen/vrndd-scalar-libm-x4.c
+  src/f32-vsqrt/gen/scalar-sqrt-x1.c
+  src/f32-vsqrt/gen/scalar-sqrt-x2.c
+  src/f32-vsqrt/gen/scalar-sqrt-x4.c
   src/f32-vunary/gen/vabs-scalar-x1.c
   src/f32-vunary/gen/vabs-scalar-x2.c
   src/f32-vunary/gen/vabs-scalar-x4.c
@@ -979,6 +982,26 @@
   src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2recps-x24.c
   src/f32-vmulcaddc/gen/c4-minmax-neonfma-2x.c
   src/f32-vmulcaddc/gen/c8-minmax-neonfma-2x.c
+  src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x4.c
+  src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x8.c
+  src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x12.c
+  src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x16.c
+  src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x20.c
+  src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x24.c
+  src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x28.c
+  src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x32.c
+  src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x36.c
+  src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x40.c
+  src/f32-vsqrt/gen/neonfma-nr2fma1adj-x4.c
+  src/f32-vsqrt/gen/neonfma-nr2fma1adj-x8.c
+  src/f32-vsqrt/gen/neonfma-nr2fma1adj-x12.c
+  src/f32-vsqrt/gen/neonfma-nr2fma1adj-x16.c
+  src/f32-vsqrt/gen/neonfma-nr2fma1adj-x20.c
+  src/f32-vsqrt/gen/neonfma-nr2fma1adj-x24.c
+  src/f32-vsqrt/gen/neonfma-nr2fma1adj-x28.c
+  src/f32-vsqrt/gen/neonfma-nr2fma1adj-x32.c
+  src/f32-vsqrt/gen/neonfma-nr2fma1adj-x36.c
+  src/f32-vsqrt/gen/neonfma-nr2fma1adj-x40.c
   src/math/exp-neonfma-lut64-p2.c
   src/math/exp-neonfma-p5.c
   src/math/expminus-neonfma-lut2048-p1.c
@@ -1098,6 +1121,8 @@
   src/f32-spmm/gen/8x1-minmax-neonfma.c
   src/f32-spmm/gen/8x2-minmax-neonfma.c
   src/f32-spmm/gen/8x4-minmax-neonfma.c
+  src/f32-vsqrt/gen/neon-sqrt-x4.c
+  src/f32-vsqrt/gen/neon-sqrt-x8.c
   src/math/sigmoid-neonfma-rr1-lut2048-p1-div.c
   src/math/sigmoid-neonfma-rr1-lut64-p2-div.c
   src/math/sigmoid-neonfma-rr1-p5-div.c
@@ -1283,6 +1308,8 @@
   src/f32-vbinary/gen/vsubc-minmax-sse-x8.c
   src/f32-vmulcaddc/gen/c4-minmax-sse-2x.c
   src/f32-vmulcaddc/gen/c8-minmax-sse-2x.c
+  src/f32-vsqrt/gen/sse-sqrt-x4.c
+  src/f32-vsqrt/gen/sse-sqrt-x8.c
   src/f32-vunary/gen/vabs-sse-x4.c
   src/f32-vunary/gen/vabs-sse-x8.c
   src/f32-vunary/gen/vneg-sse-x4.c
@@ -1489,6 +1516,8 @@
   src/f32-vrnd/gen/vrndd-avx-x8.c
   src/f32-vrnd/gen/vrndd-avx-x16.c
   src/f32-vscale/avx-unroll32.c
+  src/f32-vsqrt/gen/avx-sqrt-x8.c
+  src/f32-vsqrt/gen/avx-sqrt-x16.c
   src/f32-vunary/gen/vabs-avx-x8.c
   src/f32-vunary/gen/vabs-avx-x16.c
   src/f32-vunary/gen/vneg-avx-x8.c
@@ -1553,6 +1582,14 @@
   src/f32-igemm/gen/3x16s4-minmax-fma3-broadcast.c
   src/f32-igemm/gen/4x16s4-minmax-fma3-broadcast.c
   src/f32-igemm/gen/5x16s4-minmax-fma3-broadcast.c
+  src/f32-vsqrt/gen/fma3-nr1fma1adj-x8.c
+  src/f32-vsqrt/gen/fma3-nr1fma1adj-x16.c
+  src/f32-vsqrt/gen/fma3-nr1fma1adj-x24.c
+  src/f32-vsqrt/gen/fma3-nr1fma1adj-x32.c
+  src/f32-vsqrt/gen/fma3-nr1fma1adj-x40.c
+  src/f32-vsqrt/gen/fma3-nr1fma1adj-x48.c
+  src/f32-vsqrt/gen/fma3-nr1fma1adj-x56.c
+  src/f32-vsqrt/gen/fma3-nr1fma1adj-x64.c
   src/math/sqrt-fma3-nr1fma.c
   src/math/sqrt-fma3-nr2fma.c
   src/math/sqrt-fma3-nr1fma1adj.c)
@@ -1801,6 +1838,14 @@
   src/f32-vscaleextexp/gen/avx512f-p5-scalef-x160.c
   src/f32-vscaleextexp/gen/avx512f-p5-scalef-x176.c
   src/f32-vscaleextexp/gen/avx512f-p5-scalef-x192.c
+  src/f32-vsqrt/gen/avx512f-nr1fma1adj-x16.c
+  src/f32-vsqrt/gen/avx512f-nr1fma1adj-x32.c
+  src/f32-vsqrt/gen/avx512f-nr1fma1adj-x48.c
+  src/f32-vsqrt/gen/avx512f-nr1fma1adj-x64.c
+  src/f32-vsqrt/gen/avx512f-nr1fma1adj-x80.c
+  src/f32-vsqrt/gen/avx512f-nr1fma1adj-x96.c
+  src/f32-vsqrt/gen/avx512f-nr1fma1adj-x112.c
+  src/f32-vsqrt/gen/avx512f-nr1fma1adj-x128.c
   src/f32-vunary/gen/vabs-avx512f-x16.c
   src/f32-vunary/gen/vabs-avx512f-x32.c
   src/f32-vunary/gen/vneg-avx512f-x16.c
@@ -3169,6 +3214,15 @@
   TARGET_LINK_LIBRARIES(f32-vsqrdiffc-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
   ADD_TEST(f32-vsqrdiffc-test f32-vsqrdiffc-test)
 
+  ADD_EXECUTABLE(f32-vsqrt-test test/f32-vsqrt.cc)
+  SET_TARGET_PROPERTIES(f32-vsqrt-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(f32-vsqrt-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(f32-vsqrt-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+  ADD_TEST(f32-vsqrt-test f32-vsqrt-test)
+
   ADD_EXECUTABLE(f32-vsub-minmax-test test/f32-vsub-minmax.cc)
   SET_TARGET_PROPERTIES(f32-vsub-minmax-test PROPERTIES
     CXX_STANDARD 11
@@ -3730,6 +3784,15 @@
   TARGET_INCLUDE_DIRECTORIES(f32-softmax-bench PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
   TARGET_LINK_LIBRARIES(f32-softmax-bench PRIVATE XNNPACK fp16 benchmark bench-utils)
 
+  ADD_EXECUTABLE(f32-vsqrt-bench bench/f32-vsqrt.cc)
+  SET_TARGET_PROPERTIES(f32-vsqrt-bench PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(f32-vsqrt-bench PRIVATE src)
+  TARGET_INCLUDE_DIRECTORIES(f32-vsqrt-bench PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+  TARGET_LINK_LIBRARIES(f32-vsqrt-bench PRIVATE XNNPACK benchmark bench-utils)
+
   ADD_EXECUTABLE(q8-gemm-bench bench/q8-gemm.cc)
   SET_TARGET_PROPERTIES(q8-gemm-bench PROPERTIES
     CXX_STANDARD 11