Vector ELU microkernels

PiperOrigin-RevId: 345108685
diff --git a/BUILD.bazel b/BUILD.bazel
index 202e668..fa6109f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -423,6 +423,18 @@
     "src/f32-vbinary/gen/vsubc-scalar-x2.c",
     "src/f32-vbinary/gen/vsubc-scalar-x4.c",
     "src/f32-vbinary/gen/vsubc-scalar-x8.c",
+    "src/f32-velu/gen/velu-scalar-rr2-lut16-p3-x1.c",
+    "src/f32-velu/gen/velu-scalar-rr2-lut16-p3-x2.c",
+    "src/f32-velu/gen/velu-scalar-rr2-lut16-p3-x3.c",
+    "src/f32-velu/gen/velu-scalar-rr2-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-scalar-rr2-lut16-p3-x5.c",
+    "src/f32-velu/gen/velu-scalar-rr2-lut16-p3-x6.c",
+    "src/f32-velu/gen/velu-scalar-rr2-p6-x1.c",
+    "src/f32-velu/gen/velu-scalar-rr2-p6-x2.c",
+    "src/f32-velu/gen/velu-scalar-rr2-p6-x3.c",
+    "src/f32-velu/gen/velu-scalar-rr2-p6-x4.c",
+    "src/f32-velu/gen/velu-scalar-rr2-p6-x5.c",
+    "src/f32-velu/gen/velu-scalar-rr2-p6-x6.c",
     "src/f32-vlrelu/gen/vlrelu-scalar-x1.c",
     "src/f32-vlrelu/gen/vlrelu-scalar-x2.c",
     "src/f32-vlrelu/gen/vlrelu-scalar-x4.c",
@@ -694,6 +706,18 @@
     "src/f32-vbinary/gen/vsubc-relu-wasm-x2.c",
     "src/f32-vbinary/gen/vsubc-relu-wasm-x4.c",
     "src/f32-vbinary/gen/vsubc-relu-wasm-x8.c",
+    "src/f32-velu/gen/velu-wasm-rr2-lut16-p3-x1.c",
+    "src/f32-velu/gen/velu-wasm-rr2-lut16-p3-x2.c",
+    "src/f32-velu/gen/velu-wasm-rr2-lut16-p3-x3.c",
+    "src/f32-velu/gen/velu-wasm-rr2-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-wasm-rr2-lut16-p3-x5.c",
+    "src/f32-velu/gen/velu-wasm-rr2-lut16-p3-x6.c",
+    "src/f32-velu/gen/velu-wasm-rr2-p6-x1.c",
+    "src/f32-velu/gen/velu-wasm-rr2-p6-x2.c",
+    "src/f32-velu/gen/velu-wasm-rr2-p6-x3.c",
+    "src/f32-velu/gen/velu-wasm-rr2-p6-x4.c",
+    "src/f32-velu/gen/velu-wasm-rr2-p6-x5.c",
+    "src/f32-velu/gen/velu-wasm-rr2-p6-x6.c",
     "src/f32-vlrelu/gen/vlrelu-wasm-x1.c",
     "src/f32-vlrelu/gen/vlrelu-wasm-x2.c",
     "src/f32-vlrelu/gen/vlrelu-wasm-x4.c",
@@ -1163,6 +1187,30 @@
     "src/f32-vbinary/gen/vsubc-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vsubc-wasmsimd-x8.c",
     "src/f32-vbinary/gen/vsubc-wasmsimd-x16.c",
+    "src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x8.c",
+    "src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x12.c",
+    "src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x16.c",
+    "src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x20.c",
+    "src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x24.c",
+    "src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x8.c",
+    "src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x12.c",
+    "src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x16.c",
+    "src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x20.c",
+    "src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x24.c",
+    "src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x4.c",
+    "src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x8.c",
+    "src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x12.c",
+    "src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x16.c",
+    "src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x20.c",
+    "src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x24.c",
+    "src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x4.c",
+    "src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x8.c",
+    "src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x12.c",
+    "src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x16.c",
+    "src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x20.c",
+    "src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x24.c",
     "src/f32-vlrelu/gen/vlrelu-wasmsimd-bitselect-x4.c",
     "src/f32-vlrelu/gen/vlrelu-wasmsimd-bitselect-x8.c",
     "src/f32-vlrelu/gen/vlrelu-wasmsimd-minmax-x4.c",
@@ -1470,6 +1518,18 @@
     "src/f32-vbinary/gen/vsub-minmax-neon-x8.c",
     "src/f32-vbinary/gen/vsubc-minmax-neon-x4.c",
     "src/f32-vbinary/gen/vsubc-minmax-neon-x8.c",
+    "src/f32-velu/gen/velu-neon-rr2-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-neon-rr2-lut16-p3-x8.c",
+    "src/f32-velu/gen/velu-neon-rr2-lut16-p3-x12.c",
+    "src/f32-velu/gen/velu-neon-rr2-lut16-p3-x16.c",
+    "src/f32-velu/gen/velu-neon-rr2-lut16-p3-x20.c",
+    "src/f32-velu/gen/velu-neon-rr2-lut16-p3-x24.c",
+    "src/f32-velu/gen/velu-neon-rr2-p6-x4.c",
+    "src/f32-velu/gen/velu-neon-rr2-p6-x8.c",
+    "src/f32-velu/gen/velu-neon-rr2-p6-x12.c",
+    "src/f32-velu/gen/velu-neon-rr2-p6-x16.c",
+    "src/f32-velu/gen/velu-neon-rr2-p6-x20.c",
+    "src/f32-velu/gen/velu-neon-rr2-p6-x24.c",
     "src/f32-vlrelu/gen/vlrelu-neon-x4.c",
     "src/f32-vlrelu/gen/vlrelu-neon-x8.c",
     "src/f32-vmulcaddc/gen/c4-minmax-neon-2x.c",
@@ -1690,6 +1750,18 @@
     "src/f32-sigmoid/gen/neonfma-rr1-p5-nr2recps-x16.c",
     "src/f32-sigmoid/gen/neonfma-rr1-p5-nr2recps-x20.c",
     "src/f32-sigmoid/gen/neonfma-rr1-p5-nr2recps-x24.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x8.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x12.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x16.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x20.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x24.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-p6-x4.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-p6-x8.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-p6-x12.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-p6-x16.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-p6-x20.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-p6-x24.c",
     "src/f32-vmulcaddc/gen/c4-minmax-neonfma-2x.c",
     "src/f32-vmulcaddc/gen/c8-minmax-neonfma-2x.c",
     "src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x4.c",
@@ -2181,6 +2253,18 @@
     "src/f32-sigmoid/gen/sse2-p5-div-x16.c",
     "src/f32-sigmoid/gen/sse2-p5-div-x20.c",
     "src/f32-sigmoid/gen/sse2-p5-div-x24.c",
+    "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x8.c",
+    "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x12.c",
+    "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x16.c",
+    "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x20.c",
+    "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x24.c",
+    "src/f32-velu/gen/velu-sse2-rr2-p6-x4.c",
+    "src/f32-velu/gen/velu-sse2-rr2-p6-x8.c",
+    "src/f32-velu/gen/velu-sse2-rr2-p6-x12.c",
+    "src/f32-velu/gen/velu-sse2-rr2-p6-x16.c",
+    "src/f32-velu/gen/velu-sse2-rr2-p6-x20.c",
+    "src/f32-velu/gen/velu-sse2-rr2-p6-x24.c",
     "src/f32-vlrelu/gen/vlrelu-sse2-x4.c",
     "src/f32-vlrelu/gen/vlrelu-sse2-x8.c",
     "src/f32-vrnd/gen/vrndd-sse2-x4.c",
@@ -2343,6 +2427,18 @@
     "src/f32-sigmoid/gen/sse41-p5-div-x16.c",
     "src/f32-sigmoid/gen/sse41-p5-div-x20.c",
     "src/f32-sigmoid/gen/sse41-p5-div-x24.c",
+    "src/f32-velu/gen/velu-sse41-rr2-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-sse41-rr2-lut16-p3-x8.c",
+    "src/f32-velu/gen/velu-sse41-rr2-lut16-p3-x12.c",
+    "src/f32-velu/gen/velu-sse41-rr2-lut16-p3-x16.c",
+    "src/f32-velu/gen/velu-sse41-rr2-lut16-p3-x20.c",
+    "src/f32-velu/gen/velu-sse41-rr2-lut16-p3-x24.c",
+    "src/f32-velu/gen/velu-sse41-rr2-p6-x4.c",
+    "src/f32-velu/gen/velu-sse41-rr2-p6-x8.c",
+    "src/f32-velu/gen/velu-sse41-rr2-p6-x12.c",
+    "src/f32-velu/gen/velu-sse41-rr2-p6-x16.c",
+    "src/f32-velu/gen/velu-sse41-rr2-p6-x20.c",
+    "src/f32-velu/gen/velu-sse41-rr2-p6-x24.c",
     "src/f32-vlrelu/gen/vlrelu-sse41-x4.c",
     "src/f32-vlrelu/gen/vlrelu-sse41-x8.c",
     "src/f32-vrnd/gen/vrndd-sse41-x4.c",
@@ -2515,6 +2611,24 @@
     "src/f32-vbinary/gen/vsub-minmax-avx-x16.c",
     "src/f32-vbinary/gen/vsubc-minmax-avx-x8.c",
     "src/f32-vbinary/gen/vsubc-minmax-avx-x16.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x8.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x16.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x24.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x32.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x40.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x48.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut16-p3-x8.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut16-p3-x16.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut16-p3-x24.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut16-p3-x32.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut16-p3-x40.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut16-p3-x48.c",
+    "src/f32-velu/gen/velu-avx-rr2-p6-x8.c",
+    "src/f32-velu/gen/velu-avx-rr2-p6-x16.c",
+    "src/f32-velu/gen/velu-avx-rr2-p6-x24.c",
+    "src/f32-velu/gen/velu-avx-rr2-p6-x32.c",
+    "src/f32-velu/gen/velu-avx-rr2-p6-x40.c",
+    "src/f32-velu/gen/velu-avx-rr2-p6-x48.c",
     "src/f32-vlrelu/gen/vlrelu-avx-x8.c",
     "src/f32-vlrelu/gen/vlrelu-avx-x16.c",
     "src/f32-vrnd/gen/vrndd-avx-x8.c",
@@ -2717,6 +2831,46 @@
     "src/f32-sigmoid/gen/avx2-rr1-p5-nr2fma-x64.c",
     "src/f32-sigmoid/gen/avx2-rr1-p5-nr2fma-x72.c",
     "src/f32-sigmoid/gen/avx2-rr1-p5-nr2fma-x80.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x8.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x16.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x24.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x32.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x40.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x48.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x56.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x64.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x72.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x80.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x8.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x16.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x24.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x32.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x40.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x48.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x56.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x64.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x72.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x80.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x8.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x16.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x24.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x32.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x40.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x48.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x56.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x64.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x72.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x80.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x8.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x16.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x24.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x32.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x40.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x48.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x56.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x64.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x72.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x80.c",
     "src/f32-vscaleexpminusmax/gen/avx2-p5-x8.c",
     "src/f32-vscaleexpminusmax/gen/avx2-p5-x16.c",
     "src/f32-vscaleexpminusmax/gen/avx2-p5-x24.c",
@@ -2945,6 +3099,22 @@
     "src/f32-vbinary/gen/vsub-minmax-avx512f-x32.c",
     "src/f32-vbinary/gen/vsubc-minmax-avx512f-x16.c",
     "src/f32-vbinary/gen/vsubc-minmax-avx512f-x32.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x16.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x32.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x48.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x64.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x80.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x96.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x112.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x128.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x16.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x32.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x48.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x64.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x80.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x96.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x112.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x128.c",
     "src/f32-vlrelu/gen/vlrelu-avx512f-x16.c",
     "src/f32-vlrelu/gen/vlrelu-avx512f-x32.c",
     "src/f32-vrnd/gen/vrndd-avx512f-x16.c",
@@ -4773,6 +4943,15 @@
 )
 
 xnnpack_benchmark(
+    name = "f32_velu_bench",  # Benchmark target for the f32 vector ELU (velu) microkernels.
+    srcs = [
+        "bench/f32-velu.cc",  # Benchmark source for this target.
+        "src/xnnpack/AlignedAllocator.h",
+    ] + MICROKERNEL_BENCHMARK_HDRS,  # Appends the MICROKERNEL_BENCHMARK_HDRS list (defined elsewhere).
+    deps = MICROKERNEL_BENCHMARK_DEPS,  # Dependency list defined elsewhere; not specific to this target.
+)
+
+xnnpack_benchmark(
     name = "f32_vscaleexpminusmax_bench",
     srcs = [
         "bench/f32-vscaleexpminusmax.cc",
@@ -5868,6 +6047,15 @@
 )
 
 xnnpack_unit_test(
+    name = "f32_velu_test",  # Unit-test target for the f32 vector ELU (velu) microkernels.
+    srcs = [
+        "test/f32-velu.cc",  # Test source for this target.
+        "test/vunary-microkernel-tester.h",  # Tester header listed as a source of this target.
+    ] + MICROKERNEL_TEST_HDRS,  # Appends the MICROKERNEL_TEST_HDRS list (defined elsewhere).
+    deps = MICROKERNEL_TEST_DEPS,  # Dependency list defined elsewhere; not specific to this target.
+)
+
+xnnpack_unit_test(
     name = "f32_vmax_test",
     srcs = [
         "test/f32-vmax.cc",