Vector SQRT microkernels

PiperOrigin-RevId: 319054903
diff --git a/BUILD.bazel b/BUILD.bazel
index a06eb21..4f20594 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -292,6 +292,9 @@
     "src/f32-vrnd/gen/vrndd-scalar-libm-x1.c",
     "src/f32-vrnd/gen/vrndd-scalar-libm-x2.c",
     "src/f32-vrnd/gen/vrndd-scalar-libm-x4.c",
+    "src/f32-vsqrt/gen/scalar-sqrt-x1.c",
+    "src/f32-vsqrt/gen/scalar-sqrt-x2.c",
+    "src/f32-vsqrt/gen/scalar-sqrt-x4.c",
     "src/f32-vunary/gen/vabs-scalar-x1.c",
     "src/f32-vunary/gen/vabs-scalar-x2.c",
     "src/f32-vunary/gen/vabs-scalar-x4.c",
@@ -684,6 +687,8 @@
     "src/f32-vrnd/gen/vrndu-wasmsimd-x8.c",
     "src/f32-vrnd/gen/vrndd-wasmsimd-x4.c",
     "src/f32-vrnd/gen/vrndd-wasmsimd-x8.c",
+    "src/f32-vsqrt/gen/wasmsimd-sqrt-x4.c",
+    "src/f32-vsqrt/gen/wasmsimd-sqrt-x8.c",
     "src/f32-vunary/gen/vabs-wasmsimd-x4.c",
     "src/f32-vunary/gen/vabs-wasmsimd-x8.c",
     "src/f32-vunary/gen/vneg-wasmsimd-x4.c",
@@ -1205,6 +1210,26 @@
     "src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2recps-x24.c",
     "src/f32-vmulcaddc/gen/c4-minmax-neonfma-2x.c",
     "src/f32-vmulcaddc/gen/c8-minmax-neonfma-2x.c",
+    "src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x4.c",
+    "src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x8.c",
+    "src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x12.c",
+    "src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x16.c",
+    "src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x20.c",
+    "src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x24.c",
+    "src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x28.c",
+    "src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x32.c",
+    "src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x36.c",
+    "src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x40.c",
+    "src/f32-vsqrt/gen/neonfma-nr2fma1adj-x4.c",
+    "src/f32-vsqrt/gen/neonfma-nr2fma1adj-x8.c",
+    "src/f32-vsqrt/gen/neonfma-nr2fma1adj-x12.c",
+    "src/f32-vsqrt/gen/neonfma-nr2fma1adj-x16.c",
+    "src/f32-vsqrt/gen/neonfma-nr2fma1adj-x20.c",
+    "src/f32-vsqrt/gen/neonfma-nr2fma1adj-x24.c",
+    "src/f32-vsqrt/gen/neonfma-nr2fma1adj-x28.c",
+    "src/f32-vsqrt/gen/neonfma-nr2fma1adj-x32.c",
+    "src/f32-vsqrt/gen/neonfma-nr2fma1adj-x36.c",
+    "src/f32-vsqrt/gen/neonfma-nr2fma1adj-x40.c",
     "src/math/exp-neonfma-lut64-p2.c",
     "src/math/exp-neonfma-p5.c",
     "src/math/expminus-neonfma-lut2048-p1.c",
@@ -1311,6 +1336,8 @@
     "src/f32-spmm/gen/8x1-minmax-neonfma.c",
     "src/f32-spmm/gen/8x2-minmax-neonfma.c",
     "src/f32-spmm/gen/8x4-minmax-neonfma.c",
+    "src/f32-vsqrt/gen/neon-sqrt-x4.c",
+    "src/f32-vsqrt/gen/neon-sqrt-x8.c",
     "src/math/sigmoid-neonfma-rr1-lut2048-p1-div.c",
     "src/math/sigmoid-neonfma-rr1-lut64-p2-div.c",
     "src/math/sigmoid-neonfma-rr1-p5-div.c",
@@ -1513,6 +1540,8 @@
     "src/f32-vlrelu/gen/sse-x8.c",
     "src/f32-vmulcaddc/gen/c4-minmax-sse-2x.c",
     "src/f32-vmulcaddc/gen/c8-minmax-sse-2x.c",
+    "src/f32-vsqrt/gen/sse-sqrt-x4.c",
+    "src/f32-vsqrt/gen/sse-sqrt-x8.c",
     "src/f32-vunary/gen/vabs-sse-x4.c",
     "src/f32-vunary/gen/vabs-sse-x8.c",
     "src/f32-vunary/gen/vneg-sse-x4.c",
@@ -1723,6 +1752,8 @@
     "src/f32-vrnd/gen/vrndu-avx-x16.c",
     "src/f32-vrnd/gen/vrndd-avx-x8.c",
     "src/f32-vrnd/gen/vrndd-avx-x16.c",
+    "src/f32-vsqrt/gen/avx-sqrt-x8.c",
+    "src/f32-vsqrt/gen/avx-sqrt-x16.c",
     "src/f32-vunary/gen/vabs-avx-x8.c",
     "src/f32-vunary/gen/vabs-avx-x16.c",
     "src/f32-vunary/gen/vneg-avx-x8.c",
@@ -1788,6 +1819,14 @@
     "src/f32-igemm/gen/3x16s4-minmax-fma3-broadcast.c",
     "src/f32-igemm/gen/4x16s4-minmax-fma3-broadcast.c",
     "src/f32-igemm/gen/5x16s4-minmax-fma3-broadcast.c",
+    "src/f32-vsqrt/gen/fma3-nr1fma1adj-x8.c",
+    "src/f32-vsqrt/gen/fma3-nr1fma1adj-x16.c",
+    "src/f32-vsqrt/gen/fma3-nr1fma1adj-x24.c",
+    "src/f32-vsqrt/gen/fma3-nr1fma1adj-x32.c",
+    "src/f32-vsqrt/gen/fma3-nr1fma1adj-x40.c",
+    "src/f32-vsqrt/gen/fma3-nr1fma1adj-x48.c",
+    "src/f32-vsqrt/gen/fma3-nr1fma1adj-x56.c",
+    "src/f32-vsqrt/gen/fma3-nr1fma1adj-x64.c",
     "src/math/sqrt-fma3-nr1fma.c",
     "src/math/sqrt-fma3-nr2fma.c",
     "src/math/sqrt-fma3-nr1fma1adj.c",
@@ -2038,6 +2077,14 @@
     "src/f32-vrnd/gen/vrndu-avx512f-x32.c",
     "src/f32-vrnd/gen/vrndd-avx512f-x16.c",
     "src/f32-vrnd/gen/vrndd-avx512f-x32.c",
+    "src/f32-vsqrt/gen/avx512f-nr1fma1adj-x16.c",
+    "src/f32-vsqrt/gen/avx512f-nr1fma1adj-x32.c",
+    "src/f32-vsqrt/gen/avx512f-nr1fma1adj-x48.c",
+    "src/f32-vsqrt/gen/avx512f-nr1fma1adj-x64.c",
+    "src/f32-vsqrt/gen/avx512f-nr1fma1adj-x80.c",
+    "src/f32-vsqrt/gen/avx512f-nr1fma1adj-x96.c",
+    "src/f32-vsqrt/gen/avx512f-nr1fma1adj-x112.c",
+    "src/f32-vsqrt/gen/avx512f-nr1fma1adj-x128.c",
     "src/f32-vunary/gen/vabs-avx512f-x16.c",
     "src/f32-vunary/gen/vabs-avx512f-x32.c",
     "src/f32-vunary/gen/vneg-avx512f-x16.c",
@@ -3592,6 +3639,15 @@
 )
 
 xnnpack_benchmark(
+    name = "f32_vsqrt_bench",
+    srcs = [
+        "bench/f32-vsqrt.cc",
+        "src/xnnpack/AlignedAllocator.h",
+    ] + MICROKERNEL_BENCHMARK_HDRS,
+    deps = MICROKERNEL_BENCHMARK_DEPS,
+)
+
+xnnpack_benchmark(
     name = "f32_im2col_gemm_bench",
     srcs = [
         "bench/f32-im2col-gemm.cc",
@@ -4667,6 +4723,15 @@
 )
 
 xnnpack_unit_test(
+    name = "f32_vsqrt_test",
+    srcs = [
+        "test/f32-vsqrt.cc",
+        "test/vunary-microkernel-tester.h",
+    ] + MICROKERNEL_TEST_HDRS,
+    deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
     name = "f32_vsub_minmax_test",
     srcs = [
         "test/f32-vsub-minmax.cc",