Add F32 VLRELU benchmarks

- Based on F32 VRELU benchmarks
- Use benchmark::utils::UnaryElementwiseParameters

PiperOrigin-RevId: 389412604
diff --git a/BUILD.bazel b/BUILD.bazel
index acec396..16ff5e6 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -7566,6 +7566,15 @@
 )
 
 xnnpack_benchmark(
+    name = "f32_vlrelu_bench",  # Benchmark target built from bench/f32-vlrelu.cc.
+    srcs = [
+        "bench/f32-vlrelu.cc",
+        "src/xnnpack/AlignedAllocator.h",  # Header included by bench/f32-vlrelu.cc for its aligned buffers.
+    ] + MICROKERNEL_BENCHMARK_HDRS,
+    deps = MICROKERNEL_BENCHMARK_DEPS,
+)
+
+xnnpack_benchmark(
     name = "f32_vrelu_bench",
     srcs = [
         "bench/f32-vrelu.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cfa425e..df0e48c 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7234,6 +7234,14 @@
   TARGET_INCLUDE_DIRECTORIES(f32-vhswish-bench PRIVATE . include src)
   TARGET_LINK_LIBRARIES(f32-vhswish-bench PRIVATE benchmark bench-utils cpuinfo fp16 pthreadpool)
 
+  ADD_EXECUTABLE(f32-vlrelu-bench bench/f32-vlrelu.cc $<TARGET_OBJECTS:all_microkernels>)  # F32 VLRELU benchmark
+  SET_TARGET_PROPERTIES(f32-vlrelu-bench PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)  # Same C++11 settings as the f32-vrelu-bench target that follows.
+  TARGET_INCLUDE_DIRECTORIES(f32-vlrelu-bench PRIVATE . include src)
+  TARGET_LINK_LIBRARIES(f32-vlrelu-bench PRIVATE benchmark bench-utils cpuinfo fp16 pthreadpool)
+
   ADD_EXECUTABLE(f32-vrelu-bench bench/f32-vrelu.cc $<TARGET_OBJECTS:all_microkernels>)
   SET_TARGET_PROPERTIES(f32-vrelu-bench PROPERTIES
     CXX_STANDARD 11
diff --git a/bench/f32-vlrelu.cc b/bench/f32-vlrelu.cc
new file mode 100644
index 0000000..b0c84f3
--- /dev/null
+++ b/bench/f32-vlrelu.cc
@@ -0,0 +1,171 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+#include "bench/utils.h"
+
+#include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+#include <xnnpack/params.h>
+#include <xnnpack/params-init.h>
+
+
+// Times a single F32 VLRELU microkernel on state.range(0) floats drawn uniformly
+// from [-5, 5). If isa_check is set and returns false, nothing is benchmarked.
+static void f32_vlrelu(
+  benchmark::State& state,
+  xnn_f32_vlrelu_ukernel_function vlrelu,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
+{
+  if (isa_check != nullptr && !isa_check(state)) {
+    return;  // isa_check rejected this run
+  }
+
+  const size_t num_elements = state.range(0);
+  std::vector<float, AlignedAllocator<float, 64>> source(num_elements);
+  std::vector<float, AlignedAllocator<float, 64>> result(num_elements);
+
+  std::random_device seed_source;
+  std::mt19937 engine(seed_source());
+  std::uniform_real_distribution<float> distribution(-5.0f, 5.0f);
+  std::generate(source.begin(), source.end(), [&]() { return distribution(engine); });
+  std::fill(result.begin(), result.end(), std::nanf(""));  // destination starts out as NaN
+
+  union xnn_f32_lrelu_params params;
+  xnn_init_f32_lrelu_params(&params, 0.01f);  // NOTE(review): 0.01f is presumably the negative slope - confirm
+  for (auto _ : state) {
+    vlrelu(num_elements * sizeof(float), source.data(), result.data(), &params);  // size passed in bytes
+  }
+
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
+  const uint64_t total_iterations = static_cast<uint64_t>(state.iterations());
+  const uint64_t total_elements = total_iterations * num_elements;
+  const uint64_t total_bytes = total_iterations * (2 * num_elements * sizeof(float));  // input read + output write
+  state.counters["elements"] = benchmark::Counter(total_elements, benchmark::Counter::kIsRate);
+  state.counters["bytes"] = benchmark::Counter(total_bytes, benchmark::Counter::kIsRate);
+}
+
+// NEON kernels, enabled for 32-bit and 64-bit ARM; f32_vlrelu returns early when CheckNEON reports false.
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  BENCHMARK_CAPTURE(f32_vlrelu, neon_x4,
+                    xnn_f32_vlrelu_ukernel__neon_x4, benchmark::utils::CheckNEON)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_vlrelu, neon_x8,
+                    xnn_f32_vlrelu_ukernel__neon_x8, benchmark::utils::CheckNEON)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  BENCHMARK_CAPTURE(f32_vlrelu, sse_x4,
+                    xnn_f32_vlrelu_ukernel__sse_x4)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, sse_x8,
+                    xnn_f32_vlrelu_ukernel__sse_x8)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_vlrelu, sse2_x4,
+                    xnn_f32_vlrelu_ukernel__sse2_x4)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, sse2_x8,
+                    xnn_f32_vlrelu_ukernel__sse2_x8)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  // Kernels below need ISA extensions beyond SSE2; the check lets f32_vlrelu return early instead of faulting.
+  BENCHMARK_CAPTURE(f32_vlrelu, sse41_x4,
+                    xnn_f32_vlrelu_ukernel__sse41_x4, benchmark::utils::CheckSSE41)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, sse41_x8,
+                    xnn_f32_vlrelu_ukernel__sse41_x8, benchmark::utils::CheckSSE41)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_vlrelu, avx_x8,
+                    xnn_f32_vlrelu_ukernel__avx_x8, benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, avx_x16,
+                    xnn_f32_vlrelu_ukernel__avx_x16, benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_vlrelu, avx512f_x16,
+                    xnn_f32_vlrelu_ukernel__avx512f_x16, benchmark::utils::CheckAVX512F)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, avx512f_x32,
+                    xnn_f32_vlrelu_ukernel__avx512f_x32, benchmark::utils::CheckAVX512F)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+// wasmsimd_* kernels, registered only when XNN_ARCH_WASMSIMD is set:
+// the bitselect and minmax variants, each as x4 and x8.
+#if XNN_ARCH_WASMSIMD
+  // bitselect variants
+  BENCHMARK_CAPTURE(f32_vlrelu, wasmsimd_bitselect_x4, xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x4)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, wasmsimd_bitselect_x8, xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x8)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+
+  // minmax variants
+  BENCHMARK_CAPTURE(f32_vlrelu, wasmsimd_minmax_x4, xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x4)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, wasmsimd_minmax_x8, xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x8)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_WASMSIMD
+
+// wasm_* kernels, registered when either XNN_ARCH_WASM or XNN_ARCH_WASMSIMD is set.
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  BENCHMARK_CAPTURE(f32_vlrelu, wasm_x1, xnn_f32_vlrelu_ukernel__wasm_x1)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_vlrelu, wasm_x2, xnn_f32_vlrelu_ukernel__wasm_x2)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_vlrelu, wasm_x4, xnn_f32_vlrelu_ukernel__wasm_x4)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+// scalar_* kernels are registered unconditionally: no architecture guard and no ISA check.
+BENCHMARK_CAPTURE(f32_vlrelu, scalar_x1, xnn_f32_vlrelu_ukernel__scalar_x1)
+  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+  ->UseRealTime();
+
+BENCHMARK_CAPTURE(f32_vlrelu, scalar_x2, xnn_f32_vlrelu_ukernel__scalar_x2)
+  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+  ->UseRealTime();
+
+BENCHMARK_CAPTURE(f32_vlrelu, scalar_x4, xnn_f32_vlrelu_ukernel__scalar_x4)
+  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+  ->UseRealTime();
+
+#if !defined(XNNPACK_BENCHMARK_NO_MAIN)
+BENCHMARK_MAIN();  // omitted when XNNPACK_BENCHMARK_NO_MAIN is defined
+#endif  // !defined(XNNPACK_BENCHMARK_NO_MAIN)