AVX F32->QS8 and F32->QU8 VCVT microkernels
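
AVX implementations of the F32->QS8 and F32->QU8 conversion microkernels
for batch tiles of 8, 16, 24, and 32 elements, plus AVX parameter
initializers. The x86 dispatch now prefers these kernels when AVX is
available and falls back to the existing SSE4.1/SSE2 kernels otherwise.
Also updates the single-element tail of the SSE4.1 kernels (and the SSE
template) to extract the byte directly with _mm_extract_epi8 instead of a
16-bit lane.

Each kernel scales the inputs, clamps them from above at
(qmax - zero_point), converts to int32 with rounding, packs to int16 with
signed saturation, adds the output zero point with saturation, packs to
8 bits (signed for QS8, unsigned for QU8), and clamps from below at qmin.
A rough scalar model of one QS8 lane (an illustrative sketch only:
f32_to_qs8 is a hypothetical helper, not part of the library, and it omits
the saturation that the SIMD packs apply to far out-of-range inputs):

  #include <math.h>
  #include <stdint.h>

  static inline int8_t f32_to_qs8(float x, float scale,
                                  int8_t zero_point, int8_t qmin, int8_t qmax) {
    float v = x * scale;
    // Clamping above before conversion bounds the requantized result at qmax.
    const float max_less_zp = (float) ((int32_t) qmax - (int32_t) zero_point);
    if (v > max_less_zp) v = max_less_zp;
    // lrintf matches _mm256_cvtps_epi32 under the default rounding mode.
    int32_t w = (int32_t) lrintf(v) + (int32_t) zero_point;
    if (w < (int32_t) qmin) w = (int32_t) qmin;  // lower clamp (_mm_max_epi8)
    return (int8_t) w;
  }
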
PiperOrigin-RevId: 416426700
diff --git a/BUILD.bazel b/BUILD.bazel
index 940277d..e53b44b 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4968,6 +4968,8 @@
"src/f32-igemm/gen/1x16-minmax-avx-broadcast.c",
"src/f32-igemm/gen/5x16-minmax-avx-broadcast.c",
"src/f32-prelu/gen/avx-2x16.c",
+ "src/f32-qs8-vcvt/gen/vcvt-avx-x32.c",
+ "src/f32-qu8-vcvt/gen/vcvt-avx-x32.c",
"src/f32-vbinary/gen/vadd-minmax-avx-x16.c",
"src/f32-vbinary/gen/vaddc-minmax-avx-x16.c",
"src/f32-vbinary/gen/vdiv-minmax-avx-x16.c",
@@ -5084,6 +5086,14 @@
"src/f32-igemm/gen/7x8-minmax-avx-broadcast.c",
"src/f32-prelu/gen/avx-2x8.c",
"src/f32-prelu/gen/avx-2x16.c",
+ "src/f32-qs8-vcvt/gen/vcvt-avx-x8.c",
+ "src/f32-qs8-vcvt/gen/vcvt-avx-x16.c",
+ "src/f32-qs8-vcvt/gen/vcvt-avx-x24.c",
+ "src/f32-qs8-vcvt/gen/vcvt-avx-x32.c",
+ "src/f32-qu8-vcvt/gen/vcvt-avx-x8.c",
+ "src/f32-qu8-vcvt/gen/vcvt-avx-x16.c",
+ "src/f32-qu8-vcvt/gen/vcvt-avx-x24.c",
+ "src/f32-qu8-vcvt/gen/vcvt-avx-x32.c",
"src/f32-rmax/avx.c",
"src/f32-vbinary/gen/vadd-minmax-avx-x8.c",
"src/f32-vbinary/gen/vadd-minmax-avx-x16.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9176b70..bc1deed 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3906,6 +3906,8 @@
src/f32-igemm/gen/1x16-minmax-avx-broadcast.c
src/f32-igemm/gen/5x16-minmax-avx-broadcast.c
src/f32-prelu/gen/avx-2x16.c
+ src/f32-qs8-vcvt/gen/vcvt-avx-x32.c
+ src/f32-qu8-vcvt/gen/vcvt-avx-x32.c
src/f32-vbinary/gen/vadd-minmax-avx-x16.c
src/f32-vbinary/gen/vaddc-minmax-avx-x16.c
src/f32-vbinary/gen/vdiv-minmax-avx-x16.c
@@ -4021,6 +4023,14 @@
src/f32-igemm/gen/7x8-minmax-avx-broadcast.c
src/f32-prelu/gen/avx-2x8.c
src/f32-prelu/gen/avx-2x16.c
+ src/f32-qs8-vcvt/gen/vcvt-avx-x8.c
+ src/f32-qs8-vcvt/gen/vcvt-avx-x16.c
+ src/f32-qs8-vcvt/gen/vcvt-avx-x24.c
+ src/f32-qs8-vcvt/gen/vcvt-avx-x32.c
+ src/f32-qu8-vcvt/gen/vcvt-avx-x8.c
+ src/f32-qu8-vcvt/gen/vcvt-avx-x16.c
+ src/f32-qu8-vcvt/gen/vcvt-avx-x24.c
+ src/f32-qu8-vcvt/gen/vcvt-avx-x32.c
src/f32-rmax/avx.c
src/f32-vbinary/gen/vadd-minmax-avx-x8.c
src/f32-vbinary/gen/vadd-minmax-avx-x16.c
diff --git a/bench/f32-qs8-vcvt.cc b/bench/f32-qs8-vcvt.cc
index eb9fff3..95e296d 100644
--- a/bench/f32-qs8-vcvt.cc
+++ b/bench/f32-qs8-vcvt.cc
@@ -118,6 +118,31 @@
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(f32_qs8_vcvt, avx_x8,
+ xnn_f32_qs8_vcvt_ukernel__avx_x8,
+ xnn_init_f32_qs8_cvt_avx_params,
+ benchmark::utils::CheckAVX)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qs8_vcvt, avx_x16,
+ xnn_f32_qs8_vcvt_ukernel__avx_x16,
+ xnn_init_f32_qs8_cvt_avx_params,
+ benchmark::utils::CheckAVX)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qs8_vcvt, avx_x24,
+ xnn_f32_qs8_vcvt_ukernel__avx_x24,
+ xnn_init_f32_qs8_cvt_avx_params,
+ benchmark::utils::CheckAVX)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qs8_vcvt, avx_x32,
+ xnn_f32_qs8_vcvt_ukernel__avx_x32,
+ xnn_init_f32_qs8_cvt_avx_params,
+ benchmark::utils::CheckAVX)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+
BENCHMARK_CAPTURE(f32_qs8_vcvt, sse41_x8,
xnn_f32_qs8_vcvt_ukernel__sse41_x8,
xnn_init_f32_qs8_cvt_sse4_params,
diff --git a/bench/f32-qu8-vcvt.cc b/bench/f32-qu8-vcvt.cc
index 187d811..3f7f2c2 100644
--- a/bench/f32-qu8-vcvt.cc
+++ b/bench/f32-qu8-vcvt.cc
@@ -118,6 +118,31 @@
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, avx_x8,
+ xnn_f32_qu8_vcvt_ukernel__avx_x8,
+ xnn_init_f32_qu8_cvt_avx_params,
+ benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, avx_x16,
+ xnn_f32_qu8_vcvt_ukernel__avx_x16,
+ xnn_init_f32_qu8_cvt_avx_params,
+ benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, avx_x24,
+ xnn_f32_qu8_vcvt_ukernel__avx_x24,
+ xnn_init_f32_qu8_cvt_avx_params,
+ benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, avx_x32,
+ xnn_f32_qu8_vcvt_ukernel__avx_x32,
+ xnn_init_f32_qu8_cvt_avx_params,
+ benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+ ->UseRealTime();
+
BENCHMARK_CAPTURE(f32_qu8_vcvt, sse2_x8,
xnn_f32_qu8_vcvt_ukernel__sse2_x8,
xnn_init_f32_qu8_cvt_sse2_params)
diff --git a/scripts/generate-f32-qs8-vcvt.sh b/scripts/generate-f32-qs8-vcvt.sh
index 7f31123..0c80555 100755
--- a/scripts/generate-f32-qs8-vcvt.sh
+++ b/scripts/generate-f32-qs8-vcvt.sh
@@ -41,6 +41,17 @@
tools/xngen src/f32-qs8-vcvt/sse.c.in -D SSE=2 -D BATCH_TILE=24 -D DATATYPE=QU8 -o src/f32-qu8-vcvt/gen/vcvt-sse2-x24.c &
tools/xngen src/f32-qs8-vcvt/sse.c.in -D SSE=2 -D BATCH_TILE=32 -D DATATYPE=QU8 -o src/f32-qu8-vcvt/gen/vcvt-sse2-x32.c &
+################################# x86 256-bit #################################
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/f32-qs8-vcvt/gen/vcvt-avx-x8.c &
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/f32-qs8-vcvt/gen/vcvt-avx-x16.c &
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=24 -D DATATYPE=QS8 -o src/f32-qs8-vcvt/gen/vcvt-avx-x24.c &
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=32 -D DATATYPE=QS8 -o src/f32-qs8-vcvt/gen/vcvt-avx-x32.c &
+
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/f32-qu8-vcvt/gen/vcvt-avx-x8.c &
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=16 -D DATATYPE=QU8 -o src/f32-qu8-vcvt/gen/vcvt-avx-x16.c &
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=24 -D DATATYPE=QU8 -o src/f32-qu8-vcvt/gen/vcvt-avx-x24.c &
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=32 -D DATATYPE=QU8 -o src/f32-qu8-vcvt/gen/vcvt-avx-x32.c &
+
################################## WAsm SIMD ##################################
tools/xngen src/f32-qs8-vcvt/wasmsimd-cvt.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/f32-qs8-vcvt/gen/vcvt-wasmsimd-cvt-x8.c &
tools/xngen src/f32-qs8-vcvt/wasmsimd-cvt.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/f32-qs8-vcvt/gen/vcvt-wasmsimd-cvt-x16.c &
diff --git a/src/f32-qs8-vcvt/avx.c.in b/src/f32-qs8-vcvt/avx.c.in
new file mode 100644
index 0000000..66d9e2d
--- /dev/null
+++ b/src/f32-qs8-vcvt/avx.c.in
@@ -0,0 +1,125 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert DATATYPE in ["QS8", "QU8"]
+$assert BATCH_TILE % 8 == 0
+$assert BATCH_TILE >= 8
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
+$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE]
+$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE]
+void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx_x${BATCH_TILE}(
+ size_t n,
+ const float* x,
+ ${XINT8_T}* y,
+ const union xnn_f32_${DATATYPE.lower()}_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
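+  // Pipeline: scale, clamp from above at (qmax - zero_point), convert to
+  // int32 with rounding, pack to int16 with signed saturation, add the zero
+  // point with saturation, pack to 8 bits, then clamp from below at qmin.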
+ $if BATCH_TILE > 8:
+ for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+ __m256 vx${ABC[0:8]} = _mm256_loadu_ps(x);
+ $for N in range(8, BATCH_TILE, 8):
+ __m256 vx${ABC[N:N+8]} = _mm256_loadu_ps(x + ${N});
+ x += ${BATCH_TILE};
+
+ $for N in range(0, BATCH_TILE, 8):
+ vx${ABC[N:N+8]} = _mm256_mul_ps(vx${ABC[N:N+8]}, vscale);
+
+ $for N in range(0, BATCH_TILE, 8):
+ vx${ABC[N:N+8]} = _mm256_min_ps(vx${ABC[N:N+8]}, voutput_max_less_zero_point);
+
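+    // _mm256_cvtps_epi32 rounds with the current (default: nearest-even) mode.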
+ $for N in range(0, BATCH_TILE, 8):
+ const __m256i vacc${ABC[N:N+8]} = _mm256_cvtps_epi32(vx${ABC[N:N+8]});
+
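+    // AVX (unlike AVX2) has no 256-bit integer pack: split each vector into
+    // 128-bit halves and pack 32->16 with signed saturation.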
+ $for N in range(0, BATCH_TILE, 8):
+ __m128i vy${ABC[N:N+8]} = _mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]}), _mm256_extractf128_si256(vacc${ABC[N:N+8]}, 1));
+
+ $for N in range(0, BATCH_TILE, 8):
+ vy${ABC[N:N+8]} = _mm_adds_epi16(vy${ABC[N:N+8]}, voutput_zero_point);
+
+ $for N in range(0, BATCH_TILE, 16):
+ $if N + 8 < BATCH_TILE:
+ __m128i vy${ABC[N:N+16]} = ${_MM_PACKXS_EPI16}(vy${ABC[N:N+8]}, vy${ABC[N+8:N+16]});
+ $else:
+ vy${ABC[N:N+8]} = ${_MM_PACKXS_EPI16}(vy${ABC[N:N+8]}, vy${ABC[N:N+8]});
+
+ $for N in range(0, BATCH_TILE, 16):
+ $if N + 8 < BATCH_TILE:
+ vy${ABC[N:N+16]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+16]}, voutput_min);
+ $else:
+ vy${ABC[N:N+8]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+8]}, voutput_min);
+
+ _mm_storeu_si128((__m128i*) y, vy${ABC[0:16]});
+ $for N in range(16, BATCH_TILE, 16):
+ $if N + 8 < BATCH_TILE:
+ _mm_storeu_si128((__m128i*) (y + ${N}), vy${ABC[N:N+16]});
+ $else:
+ _mm_storel_epi64((__m128i*) (y + ${N}), vy${ABC[N:N+8]});
+ y += ${BATCH_TILE};
+ }
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ __m256 vx = _mm256_loadu_ps(x);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+ x += 8;
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = ${_MM_PACKXS_EPI16}(vy, vy);
+ vy = ${_MM_MAX_EPX8}(vy, voutput_min);
+
+ _mm_storel_epi64((__m128i*) y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(float));
+ assert(n <= 7 * sizeof(float));
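+    // mask_table holds 7 all-ones entries followed by 7 zeros; reading 8
+    // entries starting n bytes before entry 7 yields one all-ones lane per
+    // remaining element, followed by zero lanes.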
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+ __m256 vx = _mm256_maskload_ps(x, vmask);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = ${_MM_PACKXS_EPI16}(vy, vy);
+ vy = ${_MM_MAX_EPX8}(vy, voutput_min);
+
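+    // Store the final 1-7 bytes in 4-, 2-, then 1-byte pieces.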
+ if (n & (4 * sizeof(float))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+ y += 4;
+ vy = _mm_srli_epi64(vy, 32);
+ }
+ if (n & (2 * sizeof(float))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ y += 2;
+ vy = _mm_srli_epi32(vy, 16);
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = (${XINT8_T}) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/f32-qs8-vcvt/gen/vcvt-avx-x16.c b/src/f32-qs8-vcvt/gen/vcvt-avx-x16.c
new file mode 100644
index 0000000..eae490f
--- /dev/null
+++ b/src/f32-qs8-vcvt/gen/vcvt-avx-x16.c
@@ -0,0 +1,107 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-qs8-vcvt/avx.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qs8_vcvt_ukernel__avx_x16(
+ size_t n,
+ const float* x,
+ int8_t* y,
+ const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+ for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+ __m256 vx01234567 = _mm256_loadu_ps(x);
+ __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+ x += 16;
+
+ vx01234567 = _mm256_mul_ps(vx01234567, vscale);
+ vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
+
+ vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
+ vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
+
+ const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
+ const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
+
+ __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
+ __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
+
+ vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
+ vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
+
+ __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF);
+
+ vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min);
+
+ _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ __m256 vx = _mm256_loadu_ps(x);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+ x += 8;
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packs_epi16(vy, vy);
+ vy = _mm_max_epi8(vy, voutput_min);
+
+ _mm_storel_epi64((__m128i*) y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(float));
+ assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+ __m256 vx = _mm256_maskload_ps(x, vmask);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packs_epi16(vy, vy);
+ vy = _mm_max_epi8(vy, voutput_min);
+
+ if (n & (4 * sizeof(float))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+ y += 4;
+ vy = _mm_srli_epi64(vy, 32);
+ }
+ if (n & (2 * sizeof(float))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ y += 2;
+ vy = _mm_srli_epi32(vy, 16);
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = (int8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/f32-qs8-vcvt/gen/vcvt-avx-x24.c b/src/f32-qs8-vcvt/gen/vcvt-avx-x24.c
new file mode 100644
index 0000000..f270737
--- /dev/null
+++ b/src/f32-qs8-vcvt/gen/vcvt-avx-x24.c
@@ -0,0 +1,116 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-qs8-vcvt/avx.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qs8_vcvt_ukernel__avx_x24(
+ size_t n,
+ const float* x,
+ int8_t* y,
+ const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+ for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+ __m256 vx01234567 = _mm256_loadu_ps(x);
+ __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+ __m256 vxGHIJKLMN = _mm256_loadu_ps(x + 16);
+ x += 24;
+
+ vx01234567 = _mm256_mul_ps(vx01234567, vscale);
+ vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
+ vxGHIJKLMN = _mm256_mul_ps(vxGHIJKLMN, vscale);
+
+ vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
+ vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
+ vxGHIJKLMN = _mm256_min_ps(vxGHIJKLMN, voutput_max_less_zero_point);
+
+ const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
+ const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
+ const __m256i vaccGHIJKLMN = _mm256_cvtps_epi32(vxGHIJKLMN);
+
+ __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
+ __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
+ __m128i vyGHIJKLMN = _mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extractf128_si256(vaccGHIJKLMN, 1));
+
+ vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
+ vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
+ vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
+
+ __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF);
+ vyGHIJKLMN = _mm_packs_epi16(vyGHIJKLMN, vyGHIJKLMN);
+
+ vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min);
+ vyGHIJKLMN = _mm_max_epi8(vyGHIJKLMN, voutput_min);
+
+ _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
+ _mm_storel_epi64((__m128i*) (y + 16), vyGHIJKLMN);
+ y += 24;
+ }
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ __m256 vx = _mm256_loadu_ps(x);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+ x += 8;
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packs_epi16(vy, vy);
+ vy = _mm_max_epi8(vy, voutput_min);
+
+ _mm_storel_epi64((__m128i*) y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(float));
+ assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+ __m256 vx = _mm256_maskload_ps(x, vmask);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packs_epi16(vy, vy);
+ vy = _mm_max_epi8(vy, voutput_min);
+
+ if (n & (4 * sizeof(float))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+ y += 4;
+ vy = _mm_srli_epi64(vy, 32);
+ }
+ if (n & (2 * sizeof(float))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ y += 2;
+ vy = _mm_srli_epi32(vy, 16);
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = (int8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/f32-qs8-vcvt/gen/vcvt-avx-x32.c b/src/f32-qs8-vcvt/gen/vcvt-avx-x32.c
new file mode 100644
index 0000000..5ea77e2
--- /dev/null
+++ b/src/f32-qs8-vcvt/gen/vcvt-avx-x32.c
@@ -0,0 +1,122 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-qs8-vcvt/avx.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qs8_vcvt_ukernel__avx_x32(
+ size_t n,
+ const float* x,
+ int8_t* y,
+ const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+ for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
+ __m256 vx01234567 = _mm256_loadu_ps(x);
+ __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+ __m256 vxGHIJKLMN = _mm256_loadu_ps(x + 16);
+ __m256 vxOPQRSTUV = _mm256_loadu_ps(x + 24);
+ x += 32;
+
+ vx01234567 = _mm256_mul_ps(vx01234567, vscale);
+ vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
+ vxGHIJKLMN = _mm256_mul_ps(vxGHIJKLMN, vscale);
+ vxOPQRSTUV = _mm256_mul_ps(vxOPQRSTUV, vscale);
+
+ vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
+ vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
+ vxGHIJKLMN = _mm256_min_ps(vxGHIJKLMN, voutput_max_less_zero_point);
+ vxOPQRSTUV = _mm256_min_ps(vxOPQRSTUV, voutput_max_less_zero_point);
+
+ const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
+ const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
+ const __m256i vaccGHIJKLMN = _mm256_cvtps_epi32(vxGHIJKLMN);
+ const __m256i vaccOPQRSTUV = _mm256_cvtps_epi32(vxOPQRSTUV);
+
+ __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
+ __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
+ __m128i vyGHIJKLMN = _mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extractf128_si256(vaccGHIJKLMN, 1));
+ __m128i vyOPQRSTUV = _mm_packs_epi32(_mm256_castsi256_si128(vaccOPQRSTUV), _mm256_extractf128_si256(vaccOPQRSTUV, 1));
+
+ vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
+ vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
+ vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
+ vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point);
+
+ __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF);
+ __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV);
+
+ vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min);
+ vyGHIJKLMNOPQRSTUV = _mm_max_epi8(vyGHIJKLMNOPQRSTUV, voutput_min);
+
+ _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
+ _mm_storeu_si128((__m128i*) (y + 16), vyGHIJKLMNOPQRSTUV);
+ y += 32;
+ }
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ __m256 vx = _mm256_loadu_ps(x);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+ x += 8;
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packs_epi16(vy, vy);
+ vy = _mm_max_epi8(vy, voutput_min);
+
+ _mm_storel_epi64((__m128i*) y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(float));
+ assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+ __m256 vx = _mm256_maskload_ps(x, vmask);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packs_epi16(vy, vy);
+ vy = _mm_max_epi8(vy, voutput_min);
+
+ if (n & (4 * sizeof(float))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+ y += 4;
+ vy = _mm_srli_epi64(vy, 32);
+ }
+ if (n & (2 * sizeof(float))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ y += 2;
+ vy = _mm_srli_epi32(vy, 16);
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = (int8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/f32-qs8-vcvt/gen/vcvt-avx-x8.c b/src/f32-qs8-vcvt/gen/vcvt-avx-x8.c
new file mode 100644
index 0000000..afa47f5
--- /dev/null
+++ b/src/f32-qs8-vcvt/gen/vcvt-avx-x8.c
@@ -0,0 +1,80 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-qs8-vcvt/avx.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qs8_vcvt_ukernel__avx_x8(
+ size_t n,
+ const float* x,
+ int8_t* y,
+ const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ __m256 vx = _mm256_loadu_ps(x);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+ x += 8;
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packs_epi16(vy, vy);
+ vy = _mm_max_epi8(vy, voutput_min);
+
+ _mm_storel_epi64((__m128i*) y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(float));
+ assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+ __m256 vx = _mm256_maskload_ps(x, vmask);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packs_epi16(vy, vy);
+ vy = _mm_max_epi8(vy, voutput_min);
+
+ if (n & (4 * sizeof(float))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+ y += 4;
+ vy = _mm_srli_epi64(vy, 32);
+ }
+ if (n & (2 * sizeof(float))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ y += 2;
+ vy = _mm_srli_epi32(vy, 16);
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = (int8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/f32-qs8-vcvt/gen/vcvt-sse41-x16.c b/src/f32-qs8-vcvt/gen/vcvt-sse41-x16.c
index 334d49f..1c66297 100644
--- a/src/f32-qs8-vcvt/gen/vcvt-sse41-x16.c
+++ b/src/f32-qs8-vcvt/gen/vcvt-sse41-x16.c
@@ -119,7 +119,7 @@
vy = _mm_srli_epi32(vy, 16);
}
if (n & (1 * sizeof(float))) {
- *y = (int8_t) _mm_extract_epi16(vy, 0);
+ *y = (int8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/f32-qs8-vcvt/gen/vcvt-sse41-x24.c b/src/f32-qs8-vcvt/gen/vcvt-sse41-x24.c
index 29e2ef6..d2623e8 100644
--- a/src/f32-qs8-vcvt/gen/vcvt-sse41-x24.c
+++ b/src/f32-qs8-vcvt/gen/vcvt-sse41-x24.c
@@ -132,7 +132,7 @@
vy = _mm_srli_epi32(vy, 16);
}
if (n & (1 * sizeof(float))) {
- *y = (int8_t) _mm_extract_epi16(vy, 0);
+ *y = (int8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/f32-qs8-vcvt/gen/vcvt-sse41-x32.c b/src/f32-qs8-vcvt/gen/vcvt-sse41-x32.c
index 258ba84..ee2c43d 100644
--- a/src/f32-qs8-vcvt/gen/vcvt-sse41-x32.c
+++ b/src/f32-qs8-vcvt/gen/vcvt-sse41-x32.c
@@ -142,7 +142,7 @@
vy = _mm_srli_epi32(vy, 16);
}
if (n & (1 * sizeof(float))) {
- *y = (int8_t) _mm_extract_epi16(vy, 0);
+ *y = (int8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/f32-qs8-vcvt/gen/vcvt-sse41-x8.c b/src/f32-qs8-vcvt/gen/vcvt-sse41-x8.c
index 097e082..64d5ca6 100644
--- a/src/f32-qs8-vcvt/gen/vcvt-sse41-x8.c
+++ b/src/f32-qs8-vcvt/gen/vcvt-sse41-x8.c
@@ -83,7 +83,7 @@
vy = _mm_srli_epi32(vy, 16);
}
if (n & (1 * sizeof(float))) {
- *y = (int8_t) _mm_extract_epi16(vy, 0);
+ *y = (int8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/f32-qs8-vcvt/sse.c.in b/src/f32-qs8-vcvt/sse.c.in
index 1f1f9b5..8d60b9f 100644
--- a/src/f32-qs8-vcvt/sse.c.in
+++ b/src/f32-qs8-vcvt/sse.c.in
@@ -143,7 +143,7 @@
vy = _mm_srli_epi32(vy, 16);
}
if (n & (1 * sizeof(float))) {
- *y = (${XINT8_T}) _mm_extract_epi16(vy, 0);
+ *y = (${XINT8_T}) _mm_extract_epi8(vy, 0);
}
$else:
{
diff --git a/src/f32-qu8-vcvt/gen/vcvt-avx-x16.c b/src/f32-qu8-vcvt/gen/vcvt-avx-x16.c
new file mode 100644
index 0000000..6fd36b8
--- /dev/null
+++ b/src/f32-qu8-vcvt/gen/vcvt-avx-x16.c
@@ -0,0 +1,107 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-qs8-vcvt/avx.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qu8_vcvt_ukernel__avx_x16(
+ size_t n,
+ const float* x,
+ uint8_t* y,
+ const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+ for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+ __m256 vx01234567 = _mm256_loadu_ps(x);
+ __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+ x += 16;
+
+ vx01234567 = _mm256_mul_ps(vx01234567, vscale);
+ vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
+
+ vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
+ vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
+
+ const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
+ const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
+
+ __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
+ __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
+
+ vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
+ vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
+
+ __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF);
+
+ vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min);
+
+ _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ __m256 vx = _mm256_loadu_ps(x);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+ x += 8;
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packus_epi16(vy, vy);
+ vy = _mm_max_epu8(vy, voutput_min);
+
+ _mm_storel_epi64((__m128i*) y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(float));
+ assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+ __m256 vx = _mm256_maskload_ps(x, vmask);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packus_epi16(vy, vy);
+ vy = _mm_max_epu8(vy, voutput_min);
+
+ if (n & (4 * sizeof(float))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+ y += 4;
+ vy = _mm_srli_epi64(vy, 32);
+ }
+ if (n & (2 * sizeof(float))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ y += 2;
+ vy = _mm_srli_epi32(vy, 16);
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/f32-qu8-vcvt/gen/vcvt-avx-x24.c b/src/f32-qu8-vcvt/gen/vcvt-avx-x24.c
new file mode 100644
index 0000000..4790d23
--- /dev/null
+++ b/src/f32-qu8-vcvt/gen/vcvt-avx-x24.c
@@ -0,0 +1,116 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-qs8-vcvt/avx.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qu8_vcvt_ukernel__avx_x24(
+ size_t n,
+ const float* x,
+ uint8_t* y,
+ const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+ for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+ __m256 vx01234567 = _mm256_loadu_ps(x);
+ __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+ __m256 vxGHIJKLMN = _mm256_loadu_ps(x + 16);
+ x += 24;
+
+ vx01234567 = _mm256_mul_ps(vx01234567, vscale);
+ vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
+ vxGHIJKLMN = _mm256_mul_ps(vxGHIJKLMN, vscale);
+
+ vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
+ vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
+ vxGHIJKLMN = _mm256_min_ps(vxGHIJKLMN, voutput_max_less_zero_point);
+
+ const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
+ const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
+ const __m256i vaccGHIJKLMN = _mm256_cvtps_epi32(vxGHIJKLMN);
+
+ __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
+ __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
+ __m128i vyGHIJKLMN = _mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extractf128_si256(vaccGHIJKLMN, 1));
+
+ vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
+ vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
+ vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
+
+ __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF);
+ vyGHIJKLMN = _mm_packus_epi16(vyGHIJKLMN, vyGHIJKLMN);
+
+ vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min);
+ vyGHIJKLMN = _mm_max_epu8(vyGHIJKLMN, voutput_min);
+
+ _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
+ _mm_storel_epi64((__m128i*) (y + 16), vyGHIJKLMN);
+ y += 24;
+ }
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ __m256 vx = _mm256_loadu_ps(x);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+ x += 8;
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packus_epi16(vy, vy);
+ vy = _mm_max_epu8(vy, voutput_min);
+
+ _mm_storel_epi64((__m128i*) y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(float));
+ assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+ __m256 vx = _mm256_maskload_ps(x, vmask);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packus_epi16(vy, vy);
+ vy = _mm_max_epu8(vy, voutput_min);
+
+ if (n & (4 * sizeof(float))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+ y += 4;
+ vy = _mm_srli_epi64(vy, 32);
+ }
+ if (n & (2 * sizeof(float))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ y += 2;
+ vy = _mm_srli_epi32(vy, 16);
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/f32-qu8-vcvt/gen/vcvt-avx-x32.c b/src/f32-qu8-vcvt/gen/vcvt-avx-x32.c
new file mode 100644
index 0000000..547bd75
--- /dev/null
+++ b/src/f32-qu8-vcvt/gen/vcvt-avx-x32.c
@@ -0,0 +1,122 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-qs8-vcvt/avx.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qu8_vcvt_ukernel__avx_x32(
+ size_t n,
+ const float* x,
+ uint8_t* y,
+ const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+ for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
+ __m256 vx01234567 = _mm256_loadu_ps(x);
+ __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+ __m256 vxGHIJKLMN = _mm256_loadu_ps(x + 16);
+ __m256 vxOPQRSTUV = _mm256_loadu_ps(x + 24);
+ x += 32;
+
+ vx01234567 = _mm256_mul_ps(vx01234567, vscale);
+ vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
+ vxGHIJKLMN = _mm256_mul_ps(vxGHIJKLMN, vscale);
+ vxOPQRSTUV = _mm256_mul_ps(vxOPQRSTUV, vscale);
+
+ vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
+ vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
+ vxGHIJKLMN = _mm256_min_ps(vxGHIJKLMN, voutput_max_less_zero_point);
+ vxOPQRSTUV = _mm256_min_ps(vxOPQRSTUV, voutput_max_less_zero_point);
+
+ const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
+ const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
+ const __m256i vaccGHIJKLMN = _mm256_cvtps_epi32(vxGHIJKLMN);
+ const __m256i vaccOPQRSTUV = _mm256_cvtps_epi32(vxOPQRSTUV);
+
+ __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
+ __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
+ __m128i vyGHIJKLMN = _mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extractf128_si256(vaccGHIJKLMN, 1));
+ __m128i vyOPQRSTUV = _mm_packs_epi32(_mm256_castsi256_si128(vaccOPQRSTUV), _mm256_extractf128_si256(vaccOPQRSTUV, 1));
+
+ vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
+ vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
+ vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
+ vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point);
+
+ __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF);
+ __m128i vyGHIJKLMNOPQRSTUV = _mm_packus_epi16(vyGHIJKLMN, vyOPQRSTUV);
+
+ vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min);
+ vyGHIJKLMNOPQRSTUV = _mm_max_epu8(vyGHIJKLMNOPQRSTUV, voutput_min);
+
+ _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
+ _mm_storeu_si128((__m128i*) (y + 16), vyGHIJKLMNOPQRSTUV);
+ y += 32;
+ }
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ __m256 vx = _mm256_loadu_ps(x);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+ x += 8;
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packus_epi16(vy, vy);
+ vy = _mm_max_epu8(vy, voutput_min);
+
+ _mm_storel_epi64((__m128i*) y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(float));
+ assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+ __m256 vx = _mm256_maskload_ps(x, vmask);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packus_epi16(vy, vy);
+ vy = _mm_max_epu8(vy, voutput_min);
+
+ if (n & (4 * sizeof(float))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+ y += 4;
+ vy = _mm_srli_epi64(vy, 32);
+ }
+ if (n & (2 * sizeof(float))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ y += 2;
+ vy = _mm_srli_epi32(vy, 16);
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/f32-qu8-vcvt/gen/vcvt-avx-x8.c b/src/f32-qu8-vcvt/gen/vcvt-avx-x8.c
new file mode 100644
index 0000000..a75ede7
--- /dev/null
+++ b/src/f32-qu8-vcvt/gen/vcvt-avx-x8.c
@@ -0,0 +1,80 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-qs8-vcvt/avx.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qu8_vcvt_ukernel__avx_x8(
+ size_t n,
+ const float* x,
+ uint8_t* y,
+ const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ __m256 vx = _mm256_loadu_ps(x);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+ x += 8;
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packus_epi16(vy, vy);
+ vy = _mm_max_epu8(vy, voutput_min);
+
+ _mm_storel_epi64((__m128i*) y, vy);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(float));
+ assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+ __m256 vx = _mm256_maskload_ps(x, vmask);
+ vx = _mm256_mul_ps(vx, vscale);
+ vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+ const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+ __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+ vy = _mm_adds_epi16(vy, voutput_zero_point);
+ vy = _mm_packus_epi16(vy, vy);
+ vy = _mm_max_epu8(vy, voutput_min);
+
+ if (n & (4 * sizeof(float))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+ y += 4;
+ vy = _mm_srli_epi64(vy, 32);
+ }
+ if (n & (2 * sizeof(float))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ y += 2;
+ vy = _mm_srli_epi32(vy, 16);
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/init.c b/src/init.c
index f99ad26..89ffd3f 100644
--- a/src/init.c
+++ b/src/init.c
@@ -3517,7 +3517,13 @@
xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32;
xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse2_x16;
}
- if (cpuinfo_has_x86_sse4_1()) {
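+    // Prefer the AVX kernels when supported; otherwise fall back to SSE4.1/SSE2.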
+ if (cpuinfo_has_x86_avx()) {
+ xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx_x32,
+ .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx_params,
+ .element_tile = 32,
+ };
+ } else if (cpuinfo_has_x86_sse4_1()) {
xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
.ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse41_x32,
.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse4_params,
@@ -3530,11 +3536,19 @@
.element_tile = 32,
};
}
- xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
- .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
- .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
- .element_tile = 32,
- };
+ if (cpuinfo_has_x86_avx()) {
+ xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx_x32,
+ .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx_params,
+ .element_tile = 32,
+ };
+ } else {
+ xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
+ .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
+ .element_tile = 32,
+ };
+ }
if (cpuinfo_has_x86_sse4_1()) {
xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
.ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse41_x16,
diff --git a/src/params-init.c b/src/params-init.c
index 9c77f97..83d0f25 100644
--- a/src/params-init.c
+++ b/src/params-init.c
@@ -2831,6 +2831,32 @@
params->sse4.output_min[i] = output_min;
}
}
+
+XNN_INTERNAL void xnn_init_f32_qs8_cvt_avx_params(
+ union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
+ float scale,
+ int8_t output_zero_point,
+ int8_t output_min,
+ int8_t output_max)
+{
+ const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
+ for (uint32_t i = 0; i < 8; i++) {
+ params->avx.scale[i] = scale;
+ params->avx.output_max_less_zero_point[i] = output_max_less_zero_point;
+ }
+ for (uint32_t i = 0; i < 8; i++) {
+ params->avx.output_zero_point[i] = (int16_t) output_zero_point;
+ }
+ for (uint32_t i = 0; i < 16; i++) {
+ params->avx.output_min[i] = output_min;
+ }
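+  // 7 all-ones entries followed by 7 zeros: the vcvt kernels read 8 entries
+  // starting n bytes before &mask_table[7] to mask off the 1-7 element tail.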
+ for (uint32_t i = 0; i < 7; i++) {
+ params->avx.mask_table[i] = -1;
+ }
+ for (uint32_t i = 7; i < 14; i++) {
+ params->avx.mask_table[i] = 0;
+ }
+}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD
@@ -2954,6 +2980,32 @@
params->sse2.output_min[i] = output_min;
}
}
+
+XNN_INTERNAL void xnn_init_f32_qu8_cvt_avx_params(
+ union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
+ for (uint32_t i = 0; i < 8; i++) {
+ params->avx.scale[i] = scale;
+ params->avx.output_max_less_zero_point[i] = output_max_less_zero_point;
+ }
+ for (uint32_t i = 0; i < 8; i++) {
+ params->avx.output_zero_point[i] = (int16_t) output_zero_point;
+ }
+ for (uint32_t i = 0; i < 16; i++) {
+ params->avx.output_min[i] = output_min;
+ }
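+  // 7 all-ones entries followed by 7 zeros: the vcvt kernels read 8 entries
+  // starting n bytes before &mask_table[7] to mask off the 1-7 element tail.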
+ for (uint32_t i = 0; i < 7; i++) {
+ params->avx.mask_table[i] = -1;
+ }
+ for (uint32_t i = 7; i < 14; i++) {
+ params->avx.mask_table[i] = 0;
+ }
+}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD
diff --git a/src/xnnpack/params-init.h b/src/xnnpack/params-init.h
index bc05495..99d6b80 100644
--- a/src/xnnpack/params-init.h
+++ b/src/xnnpack/params-init.h
@@ -807,6 +807,13 @@
int8_t output_zero_point,
int8_t output_min,
int8_t output_max);
+
+XNN_INTERNAL void xnn_init_f32_qs8_cvt_avx_params(
+ union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
+ float scale,
+ int8_t output_zero_point,
+ int8_t output_min,
+ int8_t output_max);
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD
@@ -862,6 +869,13 @@
uint8_t output_zero_point,
uint8_t output_min,
uint8_t output_max);
+
+XNN_INTERNAL void xnn_init_f32_qu8_cvt_avx_params(
+ union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max);
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index c8f22bc..55296d3 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -896,6 +896,13 @@
XNN_ALIGN(16) int16_t output_zero_point[8];
XNN_ALIGN(16) int8_t output_min[16];
} sse4;
+ struct {
+ XNN_ALIGN(32) float scale[8];
+ XNN_ALIGN(32) float output_max_less_zero_point[8];
+ XNN_ALIGN(16) int16_t output_zero_point[8];
+ XNN_ALIGN(16) int8_t output_min[16];
+ int32_t mask_table[14];
+ } avx;
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD
struct {
@@ -951,6 +958,13 @@
XNN_ALIGN(16) int16_t output_zero_point[8];
XNN_ALIGN(16) uint8_t output_min[16];
} sse2;
+ struct {
+ XNN_ALIGN(32) float scale[8];
+ XNN_ALIGN(32) float output_max_less_zero_point[8];
+ XNN_ALIGN(16) int16_t output_zero_point[8];
+ XNN_ALIGN(16) uint8_t output_min[16];
+ int32_t mask_table[14];
+ } avx;
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD
struct {
diff --git a/src/xnnpack/vcvt.h b/src/xnnpack/vcvt.h
index 4df995b..3422d43 100644
--- a/src/xnnpack/vcvt.h
+++ b/src/xnnpack/vcvt.h
@@ -285,6 +285,11 @@
DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__sse41_x24)
DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__sse41_x32)
+DECLARE_F32_QS8_VCVT_UKERNEL_FUNCTION(xnn_f32_qs8_vcvt_ukernel__avx_x8)
+DECLARE_F32_QS8_VCVT_UKERNEL_FUNCTION(xnn_f32_qs8_vcvt_ukernel__avx_x16)
+DECLARE_F32_QS8_VCVT_UKERNEL_FUNCTION(xnn_f32_qs8_vcvt_ukernel__avx_x24)
+DECLARE_F32_QS8_VCVT_UKERNEL_FUNCTION(xnn_f32_qs8_vcvt_ukernel__avx_x32)
+
DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__wasmsimd_x8)
DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__wasmsimd_x16)
DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__wasmsimd_x24)
@@ -318,6 +323,11 @@
DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__sse41_x24)
DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__sse41_x32)
+DECLARE_F32_QU8_VCVT_UKERNEL_FUNCTION(xnn_f32_qu8_vcvt_ukernel__avx_x8)
+DECLARE_F32_QU8_VCVT_UKERNEL_FUNCTION(xnn_f32_qu8_vcvt_ukernel__avx_x16)
+DECLARE_F32_QU8_VCVT_UKERNEL_FUNCTION(xnn_f32_qu8_vcvt_ukernel__avx_x24)
+DECLARE_F32_QU8_VCVT_UKERNEL_FUNCTION(xnn_f32_qu8_vcvt_ukernel__avx_x32)
+
DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__wasmsimd_x8)
DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__wasmsimd_x16)
DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__wasmsimd_x24)
diff --git a/test/f32-qs8-vcvt.cc b/test/f32-qs8-vcvt.cc
index 458bf7a..b8de37f 100644
--- a/test/f32-qs8-vcvt.cc
+++ b/test/f32-qs8-vcvt.cc
@@ -1985,6 +1985,498 @@
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(F32_QS8_VCVT__AVX_X8, batch_eq_8) {
+ TEST_REQUIRES_X86_AVX;
+ VCvtMicrokernelTester()
+ .batch_size(8)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X8, batch_div_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X8, batch_lt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X8, batch_gt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X8, scale) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X8, zero_point) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X8, saturation) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X8, overflow) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(4294967296.0f)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X8, qmin) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmin = -128; qmin < 127; qmin += 51) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .qmin(qmin)
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X8, qmax) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmax = -127; qmax <= 127; qmax += 51) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(qmax)
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(F32_QS8_VCVT__AVX_X16, batch_eq_16) {
+ TEST_REQUIRES_X86_AVX;
+ VCvtMicrokernelTester()
+ .batch_size(16)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X16, batch_div_16) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X16, batch_lt_16) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X16, batch_gt_16) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X16, scale) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X16, zero_point) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X16, saturation) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X16, overflow) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(4294967296.0f)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X16, qmin) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmin = -128; qmin < 127; qmin += 51) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .qmin(qmin)
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X16, qmax) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmax = -127; qmax <= 127; qmax += 51) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(qmax)
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(F32_QS8_VCVT__AVX_X24, batch_eq_24) {
+ TEST_REQUIRES_X86_AVX;
+ VCvtMicrokernelTester()
+ .batch_size(24)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X24, batch_div_24) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X24, batch_lt_24) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size < 24; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X24, batch_gt_24) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 25; batch_size < 48; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X24, scale) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X24, zero_point) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X24, saturation) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X24, overflow) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(4294967296.0f)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X24, qmin) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmin = -128; qmin < 127; qmin += 51) {
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .qmin(qmin)
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X24, qmax) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmax = -127; qmax <= 127; qmax += 51) {
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(qmax)
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(F32_QS8_VCVT__AVX_X32, batch_eq_32) {
+ TEST_REQUIRES_X86_AVX;
+ VCvtMicrokernelTester()
+ .batch_size(32)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X32, batch_div_32) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X32, batch_lt_32) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X32, batch_gt_32) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X32, scale) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X32, zero_point) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X32, saturation) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X32, overflow) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(4294967296.0f)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X32, qmin) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmin = -128; qmin < 127; qmin += 51) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .qmin(qmin)
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QS8_VCVT__AVX_X32, qmax) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmax = -127; qmax <= 127; qmax += 51) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(qmax)
+ .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
#if XNN_ARCH_WASMSIMD
TEST(F32_QS8_VCVT__WASMSIMD_CVT_X8, batch_eq_8) {
VCvtMicrokernelTester()
diff --git a/test/f32-qs8-vcvt.yaml b/test/f32-qs8-vcvt.yaml
index 674a79d..3937810 100644
--- a/test/f32-qs8-vcvt.yaml
+++ b/test/f32-qs8-vcvt.yaml
@@ -35,6 +35,14 @@
init: xnn_init_f32_qs8_cvt_sse4_params
- name: xnn_f32_qs8_vcvt_ukernel__sse41_x32
init: xnn_init_f32_qs8_cvt_sse4_params
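+# Each entry names an AVX microkernel and its params-initialization function;
+# the test generator expands these specs into the suites added to
+# test/f32-qs8-vcvt.cc above.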
+- name: xnn_f32_qs8_vcvt_ukernel__avx_x8
+ init: xnn_init_f32_qs8_cvt_avx_params
+- name: xnn_f32_qs8_vcvt_ukernel__avx_x16
+ init: xnn_init_f32_qs8_cvt_avx_params
+- name: xnn_f32_qs8_vcvt_ukernel__avx_x24
+ init: xnn_init_f32_qs8_cvt_avx_params
+- name: xnn_f32_qs8_vcvt_ukernel__avx_x32
+ init: xnn_init_f32_qs8_cvt_avx_params
- name: xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_x8
init: xnn_init_f32_qs8_cvt_wasmsimd_cvt_params
- name: xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_x16
diff --git a/test/f32-qu8-vcvt.cc b/test/f32-qu8-vcvt.cc
index 7d6e41f..b604d30 100644
--- a/test/f32-qu8-vcvt.cc
+++ b/test/f32-qu8-vcvt.cc
@@ -1541,6 +1541,514 @@
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
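+ // The QU8 suites mirror the QS8 ones with uint8_t limits; the scale,
+ // saturation, qmin and qmax tests pin zero_point to 100 or 128 so that
+ // outputs stay centered within the unsigned range.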
+ TEST(F32_QU8_VCVT__AVX_X8, batch_eq_8) {
+ TEST_REQUIRES_X86_AVX;
+ VCvtMicrokernelTester()
+ .batch_size(8)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X8, batch_div_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X8, batch_lt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X8, batch_gt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X8, scale) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .zero_point(100)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X8, zero_point) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X8, saturation) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .zero_point(128)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X8, overflow) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(4294967296.0f)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X8, qmin) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmin = 0; qmin < 255; qmin += 51) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .zero_point(128)
+ .qmin(qmin)
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X8, qmax) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmax = 1; qmax <= 255; qmax += 51) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .zero_point(128)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(qmax)
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(F32_QU8_VCVT__AVX_X16, batch_eq_16) {
+ TEST_REQUIRES_X86_AVX;
+ VCvtMicrokernelTester()
+ .batch_size(16)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X16, batch_div_16) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X16, batch_lt_16) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X16, batch_gt_16) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X16, scale) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .zero_point(100)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X16, zero_point) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X16, saturation) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .zero_point(128)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X16, overflow) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(4294967296.0f)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X16, qmin) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmin = 0; qmin < 255; qmin += 51) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .zero_point(128)
+ .qmin(qmin)
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X16, qmax) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmax = 1; qmax <= 255; qmax += 51) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .zero_point(128)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(qmax)
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(F32_QU8_VCVT__AVX_X24, batch_eq_24) {
+ TEST_REQUIRES_X86_AVX;
+ VCvtMicrokernelTester()
+ .batch_size(24)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X24, batch_div_24) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X24, batch_lt_24) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size < 24; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X24, batch_gt_24) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 25; batch_size < 48; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X24, scale) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .zero_point(100)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X24, zero_point) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X24, saturation) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .zero_point(128)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X24, overflow) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(4294967296.0f)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X24, qmin) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmin = 0; qmin < 255; qmin += 51) {
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .zero_point(128)
+ .qmin(qmin)
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X24, qmax) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmax = 1; qmax <= 255; qmax += 51) {
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .zero_point(128)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(qmax)
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(F32_QU8_VCVT__AVX_X32, batch_eq_32) {
+ TEST_REQUIRES_X86_AVX;
+ VCvtMicrokernelTester()
+ .batch_size(32)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X32, batch_div_32) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X32, batch_lt_32) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X32, batch_gt_32) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X32, scale) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .zero_point(100)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X32, zero_point) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X32, saturation) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .zero_point(128)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X32, overflow) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(4294967296.0f)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X32, qmin) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmin = 0; qmin < 255; qmin += 51) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .zero_point(128)
+ .qmin(qmin)
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+ }
+
+ TEST(F32_QU8_VCVT__AVX_X32, qmax) {
+ TEST_REQUIRES_X86_AVX;
+ for (int16_t qmax = 1; qmax <= 255; qmax += 51) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(500)
+ .zero_point(128)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(qmax)
+ .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
#if XNN_ARCH_WASMSIMD
TEST(F32_QU8_VCVT__WASMSIMD_CVT_X8, batch_eq_8) {
VCvtMicrokernelTester()
diff --git a/test/f32-qu8-vcvt.yaml b/test/f32-qu8-vcvt.yaml
index e00c709..f31df03 100644
--- a/test/f32-qu8-vcvt.yaml
+++ b/test/f32-qu8-vcvt.yaml
@@ -27,6 +27,14 @@
init: xnn_init_f32_qu8_cvt_sse2_params
- name: xnn_f32_qu8_vcvt_ukernel__sse2_x32
init: xnn_init_f32_qu8_cvt_sse2_params
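+# AVX F32->QU8 specs, expanded by the test generator into the suites added to
+# test/f32-qu8-vcvt.cc above.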
+- name: xnn_f32_qu8_vcvt_ukernel__avx_x8
+ init: xnn_init_f32_qu8_cvt_avx_params
+- name: xnn_f32_qu8_vcvt_ukernel__avx_x16
+ init: xnn_init_f32_qu8_cvt_avx_params
+- name: xnn_f32_qu8_vcvt_ukernel__avx_x24
+ init: xnn_init_f32_qu8_cvt_avx_params
+- name: xnn_f32_qu8_vcvt_ukernel__avx_x32
+ init: xnn_init_f32_qu8_cvt_avx_params
- name: xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_x8
init: xnn_init_f32_qu8_cvt_wasmsimd_cvt_params
- name: xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_x16