AVX2 QS8->F32 and QU8->F32 VCVT microkernels
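
Each kernel sign-/zero-extends 8-bit inputs to 32 bits (VPMOVSXBD/VPMOVZXBD),
adds the negated zero point, converts to float (VCVTDQ2PS), and multiplies by
the scale, processing 8/16/24/32 elements per iteration. The x16 variants are
wired up as the default QS8->F32 and QU8->F32 converters on AVX2-capable CPUs,
with the existing AVX x32 kernels as the fallback.

A minimal scalar sketch of the computation being vectorized (illustrative
only, not part of this change; the helper name is hypothetical):

  #include <stddef.h>
  #include <stdint.h>

  // y[i] = scale * (x[i] - zero_point)
  static void qs8_to_f32_ref(size_t n, const int8_t* x, float* y,
                             float scale, int8_t zero_point) {
    for (size_t i = 0; i < n; i++) {
      y[i] = scale * (float) ((int32_t) x[i] - (int32_t) zero_point);
    }
  }
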
PiperOrigin-RevId: 416487347
diff --git a/BUILD.bazel b/BUILD.bazel
index d2a62cb..5092e73 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -5649,6 +5649,7 @@
"src/qc8-igemm/gen/3x8c8-minmax-fp32-avx2.c",
"src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c",
"src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
+ "src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c",
"src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
"src/qs8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
"src/qs8-igemm/gen/1x8c8-minmax-fp32-avx2.c",
@@ -5657,6 +5658,7 @@
"src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c",
"src/qu8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c",
"src/qu8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
+ "src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c",
"src/qu8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
"src/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
"src/qu8-igemm/gen/1x8c8-minmax-fp32-avx2.c",
@@ -5877,6 +5879,10 @@
"src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpmovsx.c",
"src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpunpck.c",
"src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul32.c",
+ "src/qs8-f32-vcvt/gen/vcvt-avx2-x8.c",
+ "src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c",
+ "src/qs8-f32-vcvt/gen/vcvt-avx2-x24.c",
+ "src/qs8-f32-vcvt/gen/vcvt-avx2-x32.c",
"src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
"src/qs8-gemm/gen/1x8c8-xw-minmax-fp32-avx2.c",
"src/qs8-gemm/gen/2x8c8-minmax-fp32-avx2.c",
@@ -5900,6 +5906,10 @@
"src/qu8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
"src/qu8-dwconv/gen/up32x9-minmax-fp32-avx2-mul32.c",
"src/qu8-dwconv/gen/up32x25-minmax-fp32-avx2-mul32.c",
+ "src/qu8-f32-vcvt/gen/vcvt-avx2-x8.c",
+ "src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c",
+ "src/qu8-f32-vcvt/gen/vcvt-avx2-x24.c",
+ "src/qu8-f32-vcvt/gen/vcvt-avx2-x32.c",
"src/qu8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
"src/qu8-gemm/gen/2x8c8-minmax-fp32-avx2.c",
"src/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3cc6206..42ca9c5 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4577,6 +4577,7 @@
src/qc8-igemm/gen/3x8c8-minmax-fp32-avx2.c
src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c
src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c
+ src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c
src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c
src/qs8-gemm/gen/3x8c8-minmax-fp32-avx2.c
src/qs8-igemm/gen/1x8c8-minmax-fp32-avx2.c
@@ -4585,6 +4586,7 @@
src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c
src/qu8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c
src/qu8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c
+ src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c
src/qu8-gemm/gen/1x8c8-minmax-fp32-avx2.c
src/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c
src/qu8-igemm/gen/1x8c8-minmax-fp32-avx2.c
@@ -4806,6 +4808,10 @@
src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpmovsx.c
src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpunpck.c
src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul32.c
+ src/qs8-f32-vcvt/gen/vcvt-avx2-x8.c
+ src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c
+ src/qs8-f32-vcvt/gen/vcvt-avx2-x24.c
+ src/qs8-f32-vcvt/gen/vcvt-avx2-x32.c
src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c
src/qs8-gemm/gen/1x8c8-xw-minmax-fp32-avx2.c
src/qs8-gemm/gen/2x8c8-minmax-fp32-avx2.c
@@ -4829,6 +4835,10 @@
src/qu8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c
src/qu8-dwconv/gen/up32x9-minmax-fp32-avx2-mul32.c
src/qu8-dwconv/gen/up32x25-minmax-fp32-avx2-mul32.c
+ src/qu8-f32-vcvt/gen/vcvt-avx2-x8.c
+ src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c
+ src/qu8-f32-vcvt/gen/vcvt-avx2-x24.c
+ src/qu8-f32-vcvt/gen/vcvt-avx2-x32.c
src/qu8-gemm/gen/1x8c8-minmax-fp32-avx2.c
src/qu8-gemm/gen/2x8c8-minmax-fp32-avx2.c
src/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c
diff --git a/bench/qs8-f32-vcvt.cc b/bench/qs8-f32-vcvt.cc
index 6830731..bb8ad27 100644
--- a/bench/qs8-f32-vcvt.cc
+++ b/bench/qs8-f32-vcvt.cc
@@ -93,6 +93,31 @@
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(qs8_f32_vcvt, avx2_x8,
+ xnn_qs8_f32_vcvt_ukernel__avx2_x8,
+ xnn_init_qs8_f32_cvt_avx_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qs8_f32_vcvt, avx2_x16,
+ xnn_qs8_f32_vcvt_ukernel__avx2_x16,
+ xnn_init_qs8_f32_cvt_avx_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qs8_f32_vcvt, avx2_x24,
+ xnn_qs8_f32_vcvt_ukernel__avx2_x24,
+ xnn_init_qs8_f32_cvt_avx_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qs8_f32_vcvt, avx2_x32,
+ xnn_qs8_f32_vcvt_ukernel__avx2_x32,
+ xnn_init_qs8_f32_cvt_avx_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+
BENCHMARK_CAPTURE(qs8_f32_vcvt, avx_x8,
xnn_qs8_f32_vcvt_ukernel__avx_x8,
xnn_init_qs8_f32_cvt_avx_params,
diff --git a/bench/qu8-f32-vcvt.cc b/bench/qu8-f32-vcvt.cc
index 43e62bc..9400471 100644
--- a/bench/qu8-f32-vcvt.cc
+++ b/bench/qu8-f32-vcvt.cc
@@ -93,6 +93,31 @@
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(qu8_f32_vcvt, avx2_x8,
+ xnn_qu8_f32_vcvt_ukernel__avx2_x8,
+ xnn_init_qu8_f32_cvt_avx_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qu8_f32_vcvt, avx2_x16,
+ xnn_qu8_f32_vcvt_ukernel__avx2_x16,
+ xnn_init_qu8_f32_cvt_avx_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qu8_f32_vcvt, avx2_x24,
+ xnn_qu8_f32_vcvt_ukernel__avx2_x24,
+ xnn_init_qu8_f32_cvt_avx_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qu8_f32_vcvt, avx2_x32,
+ xnn_qu8_f32_vcvt_ukernel__avx2_x32,
+ xnn_init_qu8_f32_cvt_avx_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+ ->UseRealTime();
+
BENCHMARK_CAPTURE(qu8_f32_vcvt, avx_x8,
xnn_qu8_f32_vcvt_ukernel__avx_x8,
xnn_init_qu8_f32_cvt_avx_params,
diff --git a/scripts/generate-qs8-f32-vcvt.sh b/scripts/generate-qs8-f32-vcvt.sh
index c355300..c7be7de 100755
--- a/scripts/generate-qs8-f32-vcvt.sh
+++ b/scripts/generate-qs8-f32-vcvt.sh
@@ -47,6 +47,16 @@
tools/xngen src/qs8-f32-vcvt/avx.c.in -D BATCH_TILE=24 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/vcvt-avx-x24.c &
tools/xngen src/qs8-f32-vcvt/avx.c.in -D BATCH_TILE=32 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/vcvt-avx-x32.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/vcvt-avx2-x8.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=24 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/vcvt-avx2-x24.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=32 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/vcvt-avx2-x32.c &
+
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/vcvt-avx2-x8.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=16 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=24 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/vcvt-avx2-x24.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=32 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/vcvt-avx2-x32.c &
+
################################## WAsm SIMD ##################################
tools/xngen src/qs8-f32-vcvt/wasmsimd.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/vcvt-wasmsimd-x8.c &
tools/xngen src/qs8-f32-vcvt/wasmsimd.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/vcvt-wasmsimd-x16.c &
diff --git a/src/init.c b/src/init.c
index ea8a690..bbdbd21 100644
--- a/src/init.c
+++ b/src/init.c
@@ -3573,7 +3573,18 @@
.element_tile = 32,
};
}
- if (cpuinfo_has_x86_avx()) {
+ if (cpuinfo_has_x86_avx2()) {
+ xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx2_x16,
+ .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
+ .element_tile = 16,
+ };
+ xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx2_x16,
+ .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
+ .element_tile = 16,
+ };
+ } else if (cpuinfo_has_x86_avx()) {
xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
.ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx_x32,
.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
diff --git a/src/params-init.c b/src/params-init.c
index 16c02a0..bcf0cb0 100644
--- a/src/params-init.c
+++ b/src/params-init.c
@@ -3262,10 +3262,8 @@
float scale,
int8_t zero_point)
{
- for (uint32_t i = 0; i < 4; i++) {
- params->avx.minus_zero_point[i] = -(int32_t) zero_point;
- }
for (uint32_t i = 0; i < 8; i++) {
+ params->avx.minus_zero_point[i] = -(int32_t) zero_point;
params->avx.scale[i] = scale;
}
}
@@ -3339,10 +3337,8 @@
float scale,
uint8_t zero_point)
{
- for (uint32_t i = 0; i < 4; i++) {
- params->avx.minus_zero_point[i] = -(int32_t) zero_point;
- }
for (uint32_t i = 0; i < 8; i++) {
+ params->avx.minus_zero_point[i] = -(int32_t) zero_point;
params->avx.scale[i] = scale;
}
}
diff --git a/src/qs8-f32-vcvt/avx2.c.in b/src/qs8-f32-vcvt/avx2.c.in
new file mode 100644
index 0000000..cf94031
--- /dev/null
+++ b/src/qs8-f32-vcvt/avx2.c.in
@@ -0,0 +1,90 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 8 == 0
+$assert BATCH_TILE >= 8
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
+$_MM256_CVTEPX8_EPI32 = {"QS8": "_mm256_cvtepi8_epi32", "QU8": "_mm256_cvtepu8_epi32"}[DATATYPE]
+$_MM_CVTEPX8_EPI32 = {"QS8": "_mm_cvtepi8_epi32", "QU8": "_mm_cvtepu8_epi32"}[DATATYPE]
+void xnn_${DATATYPE.lower()}_f32_vcvt_ukernel__avx2_x${BATCH_TILE}(
+ size_t n,
+ const ${XINT8_T}* x,
+ float* y,
+ const union xnn_${DATATYPE.lower()}_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(${XINT8_T}) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ for (; n >= ${BATCH_TILE} * sizeof(${XINT8_T}); n -= ${BATCH_TILE} * sizeof(${XINT8_T})) {
+ __m256i vx${ABC[0:8]} = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) x));
+ $for N in range(8, BATCH_TILE, 8):
+ __m256i vx${ABC[N:N+8]} = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) (x + ${N})));
+ x += ${BATCH_TILE};
+
+ $for N in range(0, BATCH_TILE, 8):
+ vx${ABC[N:N+8]} = _mm256_add_epi32(vx${ABC[N:N+8]}, vminus_zero_point);
+
+ $for N in range(0, BATCH_TILE, 8):
+ __m256 vy${ABC[N:N+8]} = _mm256_cvtepi32_ps(vx${ABC[N:N+8]});
+
+ $for N in range(0, BATCH_TILE, 8):
+ vy${ABC[N:N+8]} = _mm256_mul_ps(vy${ABC[N:N+8]}, vscale);
+
+ _mm256_storeu_ps(y, vy${ABC[0:8]});
+ $for N in range(8, BATCH_TILE, 8):
+ _mm256_storeu_ps(y + ${N}, vy${ABC[N:N+8]});
+ y += ${BATCH_TILE};
+ }
+ for (; n >= 8 * sizeof(${XINT8_T}); n -= 8 * sizeof(${XINT8_T})) {
+ __m256i vx = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+ x += 8;
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ _mm256_storeu_ps(y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(${XINT8_T}));
+ assert(n <= 7 * sizeof(${XINT8_T}));
+
+ __m256i vx = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ __m128 vy_lo = _mm256_castps256_ps128(vy);
+ if (n & (4 * sizeof(${XINT8_T}))) {
+ _mm_storeu_ps(y, vy_lo);
+ vy_lo = _mm256_extractf128_ps(vy, 1);
+ y += 4;
+ }
+ if (n & (2 * sizeof(${XINT8_T}))) {
+ _mm_storel_pi((__m64*) y, vy_lo);
+ vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+ y += 2;
+ }
+ if (n & (1 * sizeof(${XINT8_T}))) {
+ _mm_store_ss(y, vy_lo);
+ }
+ }
+}
diff --git a/src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c b/src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c
new file mode 100644
index 0000000..09e4f57
--- /dev/null
+++ b/src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c
@@ -0,0 +1,86 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-f32-vcvt/avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_f32_vcvt_ukernel__avx2_x16(
+ size_t n,
+ const int8_t* x,
+ float* y,
+ const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(int8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
+ __m256i vx01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ __m256i vx89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
+ x += 16;
+
+ vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+ vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
+
+ __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+ __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
+
+ vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+ vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
+
+ _mm256_storeu_ps(y, vy01234567);
+ _mm256_storeu_ps(y + 8, vy89ABCDEF);
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+ x += 8;
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ _mm256_storeu_ps(y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(int8_t));
+ assert(n <= 7 * sizeof(int8_t));
+
+ __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ __m128 vy_lo = _mm256_castps256_ps128(vy);
+ if (n & (4 * sizeof(int8_t))) {
+ _mm_storeu_ps(y, vy_lo);
+ vy_lo = _mm256_extractf128_ps(vy, 1);
+ y += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ _mm_storel_pi((__m64*) y, vy_lo);
+ vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+ y += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ _mm_store_ss(y, vy_lo);
+ }
+ }
+}
diff --git a/src/qs8-f32-vcvt/gen/vcvt-avx2-x24.c b/src/qs8-f32-vcvt/gen/vcvt-avx2-x24.c
new file mode 100644
index 0000000..a433dd9
--- /dev/null
+++ b/src/qs8-f32-vcvt/gen/vcvt-avx2-x24.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-f32-vcvt/avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_f32_vcvt_ukernel__avx2_x24(
+ size_t n,
+ const int8_t* x,
+ float* y,
+ const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(int8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ for (; n >= 24 * sizeof(int8_t); n -= 24 * sizeof(int8_t)) {
+ __m256i vx01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ __m256i vx89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
+ __m256i vxGHIJKLMN = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 16)));
+ x += 24;
+
+ vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+ vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
+ vxGHIJKLMN = _mm256_add_epi32(vxGHIJKLMN, vminus_zero_point);
+
+ __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+ __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
+ __m256 vyGHIJKLMN = _mm256_cvtepi32_ps(vxGHIJKLMN);
+
+ vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+ vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
+ vyGHIJKLMN = _mm256_mul_ps(vyGHIJKLMN, vscale);
+
+ _mm256_storeu_ps(y, vy01234567);
+ _mm256_storeu_ps(y + 8, vy89ABCDEF);
+ _mm256_storeu_ps(y + 16, vyGHIJKLMN);
+ y += 24;
+ }
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+ x += 8;
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ _mm256_storeu_ps(y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(int8_t));
+ assert(n <= 7 * sizeof(int8_t));
+
+ __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ __m128 vy_lo = _mm256_castps256_ps128(vy);
+ if (n & (4 * sizeof(int8_t))) {
+ _mm_storeu_ps(y, vy_lo);
+ vy_lo = _mm256_extractf128_ps(vy, 1);
+ y += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ _mm_storel_pi((__m64*) y, vy_lo);
+ vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+ y += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ _mm_store_ss(y, vy_lo);
+ }
+ }
+}
diff --git a/src/qs8-f32-vcvt/gen/vcvt-avx2-x32.c b/src/qs8-f32-vcvt/gen/vcvt-avx2-x32.c
new file mode 100644
index 0000000..19d3940
--- /dev/null
+++ b/src/qs8-f32-vcvt/gen/vcvt-avx2-x32.c
@@ -0,0 +1,96 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-f32-vcvt/avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_f32_vcvt_ukernel__avx2_x32(
+ size_t n,
+ const int8_t* x,
+ float* y,
+ const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(int8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
+ __m256i vx01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ __m256i vx89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
+ __m256i vxGHIJKLMN = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 16)));
+ __m256i vxOPQRSTUV = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 24)));
+ x += 32;
+
+ vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+ vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
+ vxGHIJKLMN = _mm256_add_epi32(vxGHIJKLMN, vminus_zero_point);
+ vxOPQRSTUV = _mm256_add_epi32(vxOPQRSTUV, vminus_zero_point);
+
+ __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+ __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
+ __m256 vyGHIJKLMN = _mm256_cvtepi32_ps(vxGHIJKLMN);
+ __m256 vyOPQRSTUV = _mm256_cvtepi32_ps(vxOPQRSTUV);
+
+ vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+ vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
+ vyGHIJKLMN = _mm256_mul_ps(vyGHIJKLMN, vscale);
+ vyOPQRSTUV = _mm256_mul_ps(vyOPQRSTUV, vscale);
+
+ _mm256_storeu_ps(y, vy01234567);
+ _mm256_storeu_ps(y + 8, vy89ABCDEF);
+ _mm256_storeu_ps(y + 16, vyGHIJKLMN);
+ _mm256_storeu_ps(y + 24, vyOPQRSTUV);
+ y += 32;
+ }
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+ x += 8;
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ _mm256_storeu_ps(y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(int8_t));
+ assert(n <= 7 * sizeof(int8_t));
+
+ __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ __m128 vy_lo = _mm256_castps256_ps128(vy);
+ if (n & (4 * sizeof(int8_t))) {
+ _mm_storeu_ps(y, vy_lo);
+ vy_lo = _mm256_extractf128_ps(vy, 1);
+ y += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ _mm_storel_pi((__m64*) y, vy_lo);
+ vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+ y += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ _mm_store_ss(y, vy_lo);
+ }
+ }
+}
diff --git a/src/qs8-f32-vcvt/gen/vcvt-avx2-x8.c b/src/qs8-f32-vcvt/gen/vcvt-avx2-x8.c
new file mode 100644
index 0000000..aafea27
--- /dev/null
+++ b/src/qs8-f32-vcvt/gen/vcvt-avx2-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-f32-vcvt/avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_f32_vcvt_ukernel__avx2_x8(
+ size_t n,
+ const int8_t* x,
+ float* y,
+ const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(int8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ __m256i vx01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ x += 8;
+
+ vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+
+ __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+
+ vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+
+ _mm256_storeu_ps(y, vy01234567);
+ y += 8;
+ }
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+ x += 8;
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ _mm256_storeu_ps(y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(int8_t));
+ assert(n <= 7 * sizeof(int8_t));
+
+ __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ __m128 vy_lo = _mm256_castps256_ps128(vy);
+ if (n & (4 * sizeof(int8_t))) {
+ _mm_storeu_ps(y, vy_lo);
+ vy_lo = _mm256_extractf128_ps(vy, 1);
+ y += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ _mm_storel_pi((__m64*) y, vy_lo);
+ vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+ y += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ _mm_store_ss(y, vy_lo);
+ }
+ }
+}
diff --git a/src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c b/src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c
new file mode 100644
index 0000000..62deb0f
--- /dev/null
+++ b/src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c
@@ -0,0 +1,86 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-f32-vcvt/avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_f32_vcvt_ukernel__avx2_x16(
+ size_t n,
+ const uint8_t* x,
+ float* y,
+ const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m256i vx01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ __m256i vx89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
+ x += 16;
+
+ vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+ vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
+
+ __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+ __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
+
+ vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+ vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
+
+ _mm256_storeu_ps(y, vy01234567);
+ _mm256_storeu_ps(y + 8, vy89ABCDEF);
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+ x += 8;
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ _mm256_storeu_ps(y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(uint8_t));
+ assert(n <= 7 * sizeof(uint8_t));
+
+ __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ __m128 vy_lo = _mm256_castps256_ps128(vy);
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_ps(y, vy_lo);
+ vy_lo = _mm256_extractf128_ps(vy, 1);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ _mm_storel_pi((__m64*) y, vy_lo);
+ vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ _mm_store_ss(y, vy_lo);
+ }
+ }
+}
diff --git a/src/qu8-f32-vcvt/gen/vcvt-avx2-x24.c b/src/qu8-f32-vcvt/gen/vcvt-avx2-x24.c
new file mode 100644
index 0000000..5ef479f
--- /dev/null
+++ b/src/qu8-f32-vcvt/gen/vcvt-avx2-x24.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-f32-vcvt/avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_f32_vcvt_ukernel__avx2_x24(
+ size_t n,
+ const uint8_t* x,
+ float* y,
+ const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ for (; n >= 24 * sizeof(uint8_t); n -= 24 * sizeof(uint8_t)) {
+ __m256i vx01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ __m256i vx89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
+ __m256i vxGHIJKLMN = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 16)));
+ x += 24;
+
+ vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+ vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
+ vxGHIJKLMN = _mm256_add_epi32(vxGHIJKLMN, vminus_zero_point);
+
+ __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+ __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
+ __m256 vyGHIJKLMN = _mm256_cvtepi32_ps(vxGHIJKLMN);
+
+ vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+ vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
+ vyGHIJKLMN = _mm256_mul_ps(vyGHIJKLMN, vscale);
+
+ _mm256_storeu_ps(y, vy01234567);
+ _mm256_storeu_ps(y + 8, vy89ABCDEF);
+ _mm256_storeu_ps(y + 16, vyGHIJKLMN);
+ y += 24;
+ }
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+ x += 8;
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ _mm256_storeu_ps(y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(uint8_t));
+ assert(n <= 7 * sizeof(uint8_t));
+
+ __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ __m128 vy_lo = _mm256_castps256_ps128(vy);
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_ps(y, vy_lo);
+ vy_lo = _mm256_extractf128_ps(vy, 1);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ _mm_storel_pi((__m64*) y, vy_lo);
+ vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ _mm_store_ss(y, vy_lo);
+ }
+ }
+}
diff --git a/src/qu8-f32-vcvt/gen/vcvt-avx2-x32.c b/src/qu8-f32-vcvt/gen/vcvt-avx2-x32.c
new file mode 100644
index 0000000..90540d6
--- /dev/null
+++ b/src/qu8-f32-vcvt/gen/vcvt-avx2-x32.c
@@ -0,0 +1,96 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-f32-vcvt/avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_f32_vcvt_ukernel__avx2_x32(
+ size_t n,
+ const uint8_t* x,
+ float* y,
+ const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
+ __m256i vx01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ __m256i vx89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
+ __m256i vxGHIJKLMN = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 16)));
+ __m256i vxOPQRSTUV = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 24)));
+ x += 32;
+
+ vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+ vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
+ vxGHIJKLMN = _mm256_add_epi32(vxGHIJKLMN, vminus_zero_point);
+ vxOPQRSTUV = _mm256_add_epi32(vxOPQRSTUV, vminus_zero_point);
+
+ __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+ __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
+ __m256 vyGHIJKLMN = _mm256_cvtepi32_ps(vxGHIJKLMN);
+ __m256 vyOPQRSTUV = _mm256_cvtepi32_ps(vxOPQRSTUV);
+
+ vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+ vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
+ vyGHIJKLMN = _mm256_mul_ps(vyGHIJKLMN, vscale);
+ vyOPQRSTUV = _mm256_mul_ps(vyOPQRSTUV, vscale);
+
+ _mm256_storeu_ps(y, vy01234567);
+ _mm256_storeu_ps(y + 8, vy89ABCDEF);
+ _mm256_storeu_ps(y + 16, vyGHIJKLMN);
+ _mm256_storeu_ps(y + 24, vyOPQRSTUV);
+ y += 32;
+ }
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+ x += 8;
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ _mm256_storeu_ps(y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(uint8_t));
+ assert(n <= 7 * sizeof(uint8_t));
+
+ __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ __m128 vy_lo = _mm256_castps256_ps128(vy);
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_ps(y, vy_lo);
+ vy_lo = _mm256_extractf128_ps(vy, 1);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ _mm_storel_pi((__m64*) y, vy_lo);
+ vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ _mm_store_ss(y, vy_lo);
+ }
+ }
+}
diff --git a/src/qu8-f32-vcvt/gen/vcvt-avx2-x8.c b/src/qu8-f32-vcvt/gen/vcvt-avx2-x8.c
new file mode 100644
index 0000000..48a0792
--- /dev/null
+++ b/src/qu8-f32-vcvt/gen/vcvt-avx2-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-f32-vcvt/avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_f32_vcvt_ukernel__avx2_x8(
+ size_t n,
+ const uint8_t* x,
+ float* y,
+ const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ __m256i vx01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ x += 8;
+
+ vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+
+ __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+
+ vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+
+ _mm256_storeu_ps(y, vy01234567);
+ y += 8;
+ }
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+ x += 8;
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ _mm256_storeu_ps(y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(uint8_t));
+ assert(n <= 7 * sizeof(uint8_t));
+
+ __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+ vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+ __m256 vy = _mm256_cvtepi32_ps(vx);
+ vy = _mm256_mul_ps(vy, vscale);
+
+ __m128 vy_lo = _mm256_castps256_ps128(vy);
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_ps(y, vy_lo);
+ vy_lo = _mm256_extractf128_ps(vy, 1);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ _mm_storel_pi((__m64*) y, vy_lo);
+ vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ _mm_store_ss(y, vy_lo);
+ }
+ }
+}
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 5dba744..26da611 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -1038,7 +1038,7 @@
XNN_ALIGN(16) float scale[4];
} sse4;
struct {
- XNN_ALIGN(16) int32_t minus_zero_point[4];
+ XNN_ALIGN(32) int32_t minus_zero_point[8];
XNN_ALIGN(32) float scale[8];
} avx;
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -1072,7 +1072,7 @@
XNN_ALIGN(16) float scale[4];
} sse4;
struct {
- XNN_ALIGN(16) int32_t minus_zero_point[4];
+ XNN_ALIGN(32) int32_t minus_zero_point[8];
XNN_ALIGN(32) float scale[8];
} avx;
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/src/xnnpack/vcvt.h b/src/xnnpack/vcvt.h
index c9ffd83..cfc1614 100644
--- a/src/xnnpack/vcvt.h
+++ b/src/xnnpack/vcvt.h
@@ -298,6 +298,11 @@
DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__avx_x24)
DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__avx_x32)
+DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__avx2_x8)
+DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__avx2_x16)
+DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__avx2_x24)
+DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__avx2_x32)
+
DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__wasmsimd_x8)
DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__wasmsimd_x16)
DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__wasmsimd_x24)
@@ -336,6 +341,11 @@
DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__avx_x24)
DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__avx_x32)
+DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__avx2_x8)
+DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__avx2_x16)
+DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__avx2_x24)
+DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__avx2_x32)
+
DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__wasmsimd_x8)
DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__wasmsimd_x16)
DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__wasmsimd_x24)
diff --git a/test/qs8-f32-vcvt.cc b/test/qs8-f32-vcvt.cc
index a6e54cc..6cb814f 100644
--- a/test/qs8-f32-vcvt.cc
+++ b/test/qs8-f32-vcvt.cc
@@ -961,6 +961,242 @@
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_F32_VCVT__AVX2_X8, batch_eq_8) {
+ TEST_REQUIRES_X86_AVX2;
+ VCvtMicrokernelTester()
+ .batch_size(8)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x8, xnn_init_qs8_f32_cvt_avx_params);
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X8, batch_div_8) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x8, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X8, batch_lt_8) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x8, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X8, batch_gt_8) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x8, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X8, scale) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x8, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X8, zero_point) {
+ TEST_REQUIRES_X86_AVX2;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x8, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_F32_VCVT__AVX2_X16, batch_eq_16) {
+ TEST_REQUIRES_X86_AVX2;
+ VCvtMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x16, xnn_init_qs8_f32_cvt_avx_params);
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X16, batch_div_16) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x16, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X16, batch_lt_16) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x16, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X16, batch_gt_16) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x16, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X16, scale) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x16, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X16, zero_point) {
+ TEST_REQUIRES_X86_AVX2;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x16, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_F32_VCVT__AVX2_X24, batch_eq_24) {
+ TEST_REQUIRES_X86_AVX2;
+ VCvtMicrokernelTester()
+ .batch_size(24)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x24, xnn_init_qs8_f32_cvt_avx_params);
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X24, batch_div_24) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x24, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X24, batch_lt_24) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size < 24; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x24, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X24, batch_gt_24) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 25; batch_size < 48; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x24, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X24, scale) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x24, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X24, zero_point) {
+ TEST_REQUIRES_X86_AVX2;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x24, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_F32_VCVT__AVX2_X32, batch_eq_32) {
+ TEST_REQUIRES_X86_AVX2;
+ VCvtMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x32, xnn_init_qs8_f32_cvt_avx_params);
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X32, batch_div_32) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x32, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X32, batch_lt_32) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x32, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X32, batch_gt_32) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x32, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X32, scale) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x32, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QS8_F32_VCVT__AVX2_X32, zero_point) {
+ TEST_REQUIRES_X86_AVX2;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x32, xnn_init_qs8_f32_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
#if XNN_ARCH_WASMSIMD
TEST(QS8_F32_VCVT__WASMSIMD_X8, batch_eq_8) {
VCvtMicrokernelTester()
diff --git a/test/qs8-f32-vcvt.yaml b/test/qs8-f32-vcvt.yaml
index 0d44707..e40a9af 100644
--- a/test/qs8-f32-vcvt.yaml
+++ b/test/qs8-f32-vcvt.yaml
@@ -35,6 +35,14 @@
init: xnn_init_qs8_f32_cvt_avx_params
- name: xnn_qs8_f32_vcvt_ukernel__avx_x32
init: xnn_init_qs8_f32_cvt_avx_params
+- name: xnn_qs8_f32_vcvt_ukernel__avx2_x8
+ init: xnn_init_qs8_f32_cvt_avx_params
+- name: xnn_qs8_f32_vcvt_ukernel__avx2_x16
+ init: xnn_init_qs8_f32_cvt_avx_params
+- name: xnn_qs8_f32_vcvt_ukernel__avx2_x24
+ init: xnn_init_qs8_f32_cvt_avx_params
+- name: xnn_qs8_f32_vcvt_ukernel__avx2_x32
+ init: xnn_init_qs8_f32_cvt_avx_params
- name: xnn_qs8_f32_vcvt_ukernel__wasmsimd_x8
init: xnn_init_qs8_f32_cvt_wasmsimd_params
- name: xnn_qs8_f32_vcvt_ukernel__wasmsimd_x16
diff --git a/test/qu8-f32-vcvt.cc b/test/qu8-f32-vcvt.cc
index 95eac8f..058f687 100644
--- a/test/qu8-f32-vcvt.cc
+++ b/test/qu8-f32-vcvt.cc
@@ -961,6 +961,242 @@
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QU8_F32_VCVT__AVX2_X8, batch_eq_8) {
+ TEST_REQUIRES_X86_AVX2;
+ VCvtMicrokernelTester()
+ .batch_size(8)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x8, xnn_init_qu8_f32_cvt_avx_params);
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X8, batch_div_8) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x8, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X8, batch_lt_8) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x8, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X8, batch_gt_8) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x8, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X8, scale) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x8, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X8, zero_point) {
+ TEST_REQUIRES_X86_AVX2;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x8, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QU8_F32_VCVT__AVX2_X16, batch_eq_16) {
+ TEST_REQUIRES_X86_AVX2;
+ VCvtMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x16, xnn_init_qu8_f32_cvt_avx_params);
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X16, batch_div_16) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x16, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X16, batch_lt_16) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x16, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X16, batch_gt_16) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x16, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X16, scale) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x16, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X16, zero_point) {
+ TEST_REQUIRES_X86_AVX2;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x16, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QU8_F32_VCVT__AVX2_X24, batch_eq_24) {
+ TEST_REQUIRES_X86_AVX2;
+ VCvtMicrokernelTester()
+ .batch_size(24)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x24, xnn_init_qu8_f32_cvt_avx_params);
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X24, batch_div_24) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x24, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X24, batch_lt_24) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size < 24; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x24, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X24, batch_gt_24) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 25; batch_size < 48; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x24, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X24, scale) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x24, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X24, zero_point) {
+ TEST_REQUIRES_X86_AVX2;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x24, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QU8_F32_VCVT__AVX2_X32, batch_eq_32) {
+ TEST_REQUIRES_X86_AVX2;
+ VCvtMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x32, xnn_init_qu8_f32_cvt_avx_params);
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X32, batch_div_32) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x32, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X32, batch_lt_32) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x32, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X32, batch_gt_32) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x32, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X32, scale) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x32, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+
+ TEST(QU8_F32_VCVT__AVX2_X32, zero_point) {
+ TEST_REQUIRES_X86_AVX2;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x32, xnn_init_qu8_f32_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
#if XNN_ARCH_WASMSIMD
TEST(QU8_F32_VCVT__WASMSIMD_X8, batch_eq_8) {
VCvtMicrokernelTester()
diff --git a/test/qu8-f32-vcvt.yaml b/test/qu8-f32-vcvt.yaml
index 716a498..02d69f2 100644
--- a/test/qu8-f32-vcvt.yaml
+++ b/test/qu8-f32-vcvt.yaml
@@ -35,6 +35,14 @@
init: xnn_init_qu8_f32_cvt_avx_params
- name: xnn_qu8_f32_vcvt_ukernel__avx_x32
init: xnn_init_qu8_f32_cvt_avx_params
+- name: xnn_qu8_f32_vcvt_ukernel__avx2_x8
+ init: xnn_init_qu8_f32_cvt_avx_params
+- name: xnn_qu8_f32_vcvt_ukernel__avx2_x16
+ init: xnn_init_qu8_f32_cvt_avx_params
+- name: xnn_qu8_f32_vcvt_ukernel__avx2_x24
+ init: xnn_init_qu8_f32_cvt_avx_params
+- name: xnn_qu8_f32_vcvt_ukernel__avx2_x32
+ init: xnn_init_qu8_f32_cvt_avx_params
- name: xnn_qu8_f32_vcvt_ukernel__wasmsimd_x8
init: xnn_init_qu8_f32_cvt_wasmsimd_params
- name: xnn_qu8_f32_vcvt_ukernel__wasmsimd_x16