AVX F32->QS8 and F32->QU8 VCVT microkernels

PiperOrigin-RevId: 416426700
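
The conversion scales each float by the quantization scale, clamps the
result against (output_max - output_zero_point) while still in float,
rounds with VCVTPS2DQ, and narrows to 8 bits through saturating 16-bit
packing, saturating zero-point addition, 8-bit packing, and a final
clamp against output_min. A minimal scalar sketch of the per-element
math the kernels vectorize (illustrative only: the helper name is not
part of this change, and the int32 saturation that the SIMD packs
provide for very negative products is omitted):

    #include <math.h>
    #include <stdint.h>

    static int8_t f32_to_qs8_scalar(float x, float scale,
                                    int16_t zero_point,
                                    int8_t output_min, int8_t output_max) {
      const float max_less_zp =
        (float) ((int32_t) output_max - (int32_t) zero_point);
      float vx = x * scale;                    // _mm256_mul_ps
      if (vx > max_less_zp) vx = max_less_zp;  // _mm256_min_ps
      int32_t acc = (int32_t) lrintf(vx);      // _mm256_cvtps_epi32
      acc += zero_point;                       // _mm_adds_epi16
      if (acc < output_min) acc = output_min;  // _mm_max_epi8
      return (int8_t) acc;                     // _mm_packs_epi16
    }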
diff --git a/BUILD.bazel b/BUILD.bazel
index 940277d..e53b44b 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4968,6 +4968,8 @@
     "src/f32-igemm/gen/1x16-minmax-avx-broadcast.c",
     "src/f32-igemm/gen/5x16-minmax-avx-broadcast.c",
     "src/f32-prelu/gen/avx-2x16.c",
+    "src/f32-qs8-vcvt/gen/vcvt-avx-x32.c",
+    "src/f32-qu8-vcvt/gen/vcvt-avx-x32.c",
     "src/f32-vbinary/gen/vadd-minmax-avx-x16.c",
     "src/f32-vbinary/gen/vaddc-minmax-avx-x16.c",
     "src/f32-vbinary/gen/vdiv-minmax-avx-x16.c",
@@ -5084,6 +5086,14 @@
     "src/f32-igemm/gen/7x8-minmax-avx-broadcast.c",
     "src/f32-prelu/gen/avx-2x8.c",
     "src/f32-prelu/gen/avx-2x16.c",
+    "src/f32-qs8-vcvt/gen/vcvt-avx-x8.c",
+    "src/f32-qs8-vcvt/gen/vcvt-avx-x16.c",
+    "src/f32-qs8-vcvt/gen/vcvt-avx-x24.c",
+    "src/f32-qs8-vcvt/gen/vcvt-avx-x32.c",
+    "src/f32-qu8-vcvt/gen/vcvt-avx-x8.c",
+    "src/f32-qu8-vcvt/gen/vcvt-avx-x16.c",
+    "src/f32-qu8-vcvt/gen/vcvt-avx-x24.c",
+    "src/f32-qu8-vcvt/gen/vcvt-avx-x32.c",
     "src/f32-rmax/avx.c",
     "src/f32-vbinary/gen/vadd-minmax-avx-x8.c",
     "src/f32-vbinary/gen/vadd-minmax-avx-x16.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9176b70..bc1deed 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3906,6 +3906,8 @@
   src/f32-igemm/gen/1x16-minmax-avx-broadcast.c
   src/f32-igemm/gen/5x16-minmax-avx-broadcast.c
   src/f32-prelu/gen/avx-2x16.c
+  src/f32-qs8-vcvt/gen/vcvt-avx-x32.c
+  src/f32-qu8-vcvt/gen/vcvt-avx-x32.c
   src/f32-vbinary/gen/vadd-minmax-avx-x16.c
   src/f32-vbinary/gen/vaddc-minmax-avx-x16.c
   src/f32-vbinary/gen/vdiv-minmax-avx-x16.c
@@ -4021,6 +4023,14 @@
   src/f32-igemm/gen/7x8-minmax-avx-broadcast.c
   src/f32-prelu/gen/avx-2x8.c
   src/f32-prelu/gen/avx-2x16.c
+  src/f32-qs8-vcvt/gen/vcvt-avx-x8.c
+  src/f32-qs8-vcvt/gen/vcvt-avx-x16.c
+  src/f32-qs8-vcvt/gen/vcvt-avx-x24.c
+  src/f32-qs8-vcvt/gen/vcvt-avx-x32.c
+  src/f32-qu8-vcvt/gen/vcvt-avx-x8.c
+  src/f32-qu8-vcvt/gen/vcvt-avx-x16.c
+  src/f32-qu8-vcvt/gen/vcvt-avx-x24.c
+  src/f32-qu8-vcvt/gen/vcvt-avx-x32.c
   src/f32-rmax/avx.c
   src/f32-vbinary/gen/vadd-minmax-avx-x8.c
   src/f32-vbinary/gen/vadd-minmax-avx-x16.c
diff --git a/bench/f32-qs8-vcvt.cc b/bench/f32-qs8-vcvt.cc
index eb9fff3..95e296d 100644
--- a/bench/f32-qs8-vcvt.cc
+++ b/bench/f32-qs8-vcvt.cc
@@ -118,6 +118,31 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx_x8,
+                    xnn_f32_qs8_vcvt_ukernel__avx_x8,
+                    xnn_init_f32_qs8_cvt_avx_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx_x16,
+                    xnn_f32_qs8_vcvt_ukernel__avx_x16,
+                    xnn_init_f32_qs8_cvt_avx_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx_x24,
+                    xnn_f32_qs8_vcvt_ukernel__avx_x24,
+                    xnn_init_f32_qs8_cvt_avx_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx_x32,
+                    xnn_f32_qs8_vcvt_ukernel__avx_x32,
+                    xnn_init_f32_qs8_cvt_avx_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+    ->UseRealTime();
+
   BENCHMARK_CAPTURE(f32_qs8_vcvt, sse41_x8,
                     xnn_f32_qs8_vcvt_ukernel__sse41_x8,
                     xnn_init_f32_qs8_cvt_sse4_params,
diff --git a/bench/f32-qu8-vcvt.cc b/bench/f32-qu8-vcvt.cc
index 187d811..3f7f2c2 100644
--- a/bench/f32-qu8-vcvt.cc
+++ b/bench/f32-qu8-vcvt.cc
@@ -118,6 +118,31 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx_x8,
+                    xnn_f32_qu8_vcvt_ukernel__avx_x8,
+                    xnn_init_f32_qu8_cvt_avx_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx_x16,
+                    xnn_f32_qu8_vcvt_ukernel__avx_x16,
+                    xnn_init_f32_qu8_cvt_avx_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx_x24,
+                    xnn_f32_qu8_vcvt_ukernel__avx_x24,
+                    xnn_init_f32_qu8_cvt_avx_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx_x32,
+                    xnn_f32_qu8_vcvt_ukernel__avx_x32,
+                    xnn_init_f32_qu8_cvt_avx_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+    ->UseRealTime();
+
   BENCHMARK_CAPTURE(f32_qu8_vcvt, sse2_x8,
                     xnn_f32_qu8_vcvt_ukernel__sse2_x8,
                     xnn_init_f32_qu8_cvt_sse2_params)
diff --git a/scripts/generate-f32-qs8-vcvt.sh b/scripts/generate-f32-qs8-vcvt.sh
index 7f31123..0c80555 100755
--- a/scripts/generate-f32-qs8-vcvt.sh
+++ b/scripts/generate-f32-qs8-vcvt.sh
@@ -41,6 +41,17 @@
 tools/xngen src/f32-qs8-vcvt/sse.c.in -D SSE=2 -D BATCH_TILE=24 -D DATATYPE=QU8 -o src/f32-qu8-vcvt/gen/vcvt-sse2-x24.c &
 tools/xngen src/f32-qs8-vcvt/sse.c.in -D SSE=2 -D BATCH_TILE=32 -D DATATYPE=QU8 -o src/f32-qu8-vcvt/gen/vcvt-sse2-x32.c &
 
+################################# x86 256-bit #################################
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=8  -D DATATYPE=QS8 -o src/f32-qs8-vcvt/gen/vcvt-avx-x8.c &
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/f32-qs8-vcvt/gen/vcvt-avx-x16.c &
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=24 -D DATATYPE=QS8 -o src/f32-qs8-vcvt/gen/vcvt-avx-x24.c &
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=32 -D DATATYPE=QS8 -o src/f32-qs8-vcvt/gen/vcvt-avx-x32.c &
+
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=8  -D DATATYPE=QU8 -o src/f32-qu8-vcvt/gen/vcvt-avx-x8.c &
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=16 -D DATATYPE=QU8 -o src/f32-qu8-vcvt/gen/vcvt-avx-x16.c &
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=24 -D DATATYPE=QU8 -o src/f32-qu8-vcvt/gen/vcvt-avx-x24.c &
+tools/xngen src/f32-qs8-vcvt/avx.c.in -D BATCH_TILE=32 -D DATATYPE=QU8 -o src/f32-qu8-vcvt/gen/vcvt-avx-x32.c &
+
 ################################## WAsm SIMD ##################################
 tools/xngen src/f32-qs8-vcvt/wasmsimd-cvt.c.in -D BATCH_TILE=8  -D DATATYPE=QS8 -o src/f32-qs8-vcvt/gen/vcvt-wasmsimd-cvt-x8.c &
 tools/xngen src/f32-qs8-vcvt/wasmsimd-cvt.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/f32-qs8-vcvt/gen/vcvt-wasmsimd-cvt-x16.c &
diff --git a/src/f32-qs8-vcvt/avx.c.in b/src/f32-qs8-vcvt/avx.c.in
new file mode 100644
index 0000000..66d9e2d
--- /dev/null
+++ b/src/f32-qs8-vcvt/avx.c.in
@@ -0,0 +1,125 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert DATATYPE in ["QS8", "QU8"]
+$assert BATCH_TILE % 8 == 0
+$assert BATCH_TILE >= 8
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
+$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE]
+$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE]
+void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx_x${BATCH_TILE}(
+    size_t n,
+    const float* x,
+    ${XINT8_T}* y,
+    const union xnn_f32_${DATATYPE.lower()}_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+  $if BATCH_TILE > 8:
+    for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+      __m256 vx${ABC[0:8]} = _mm256_loadu_ps(x);
+      $for N in range(8, BATCH_TILE, 8):
+        __m256 vx${ABC[N:N+8]} = _mm256_loadu_ps(x + ${N});
+      x += ${BATCH_TILE};
+
+      $for N in range(0, BATCH_TILE, 8):
+        vx${ABC[N:N+8]} = _mm256_mul_ps(vx${ABC[N:N+8]}, vscale);
+
+      $for N in range(0, BATCH_TILE, 8):
+        vx${ABC[N:N+8]} = _mm256_min_ps(vx${ABC[N:N+8]}, voutput_max_less_zero_point);
+
+      $for N in range(0, BATCH_TILE, 8):
+        const __m256i vacc${ABC[N:N+8]} = _mm256_cvtps_epi32(vx${ABC[N:N+8]});
+
+      $for N in range(0, BATCH_TILE, 8):
+        __m128i vy${ABC[N:N+8]} = _mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]}), _mm256_extractf128_si256(vacc${ABC[N:N+8]}, 1));
+
+      $for N in range(0, BATCH_TILE, 8):
+        vy${ABC[N:N+8]} = _mm_adds_epi16(vy${ABC[N:N+8]}, voutput_zero_point);
+
+      $for N in range(0, BATCH_TILE, 16):
+        $if N + 8 < BATCH_TILE:
+          __m128i vy${ABC[N:N+16]} = ${_MM_PACKXS_EPI16}(vy${ABC[N:N+8]}, vy${ABC[N+8:N+16]});
+        $else:
+          vy${ABC[N:N+8]} = ${_MM_PACKXS_EPI16}(vy${ABC[N:N+8]}, vy${ABC[N:N+8]});
+
+      $for N in range(0, BATCH_TILE, 16):
+        $if N + 8 < BATCH_TILE:
+          vy${ABC[N:N+16]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+16]}, voutput_min);
+        $else:
+          vy${ABC[N:N+8]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+8]}, voutput_min);
+
+      _mm_storeu_si128((__m128i*) y, vy${ABC[0:16]});
+      $for N in range(16, BATCH_TILE, 16):
+        $if N + 8 < BATCH_TILE:
+          _mm_storeu_si128((__m128i*) (y + ${N}), vy${ABC[N:N+16]});
+        $else:
+          _mm_storel_epi64((__m128i*) (y + ${N}), vy${ABC[N:N+8]});
+      y += ${BATCH_TILE};
+    }
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    __m256 vx = _mm256_loadu_ps(x);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+    x += 8;
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = ${_MM_PACKXS_EPI16}(vy, vy);
+    vy = ${_MM_MAX_EPX8}(vy, voutput_min);
+
+    _mm_storel_epi64((__m128i*) y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+    __m256 vx = _mm256_maskload_ps(x, vmask);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = ${_MM_PACKXS_EPI16}(vy, vy);
+    vy = ${_MM_MAX_EPX8}(vy, voutput_min);
+
+    if (n & (4 * sizeof(float))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+      y += 4;
+      vy = _mm_srli_epi64(vy, 32);
+    }
+    if (n & (2 * sizeof(float))) {
+      *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+      y += 2;
+      vy = _mm_srli_epi32(vy, 16);
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = (${XINT8_T}) _mm_extract_epi8(vy, 0);
+    }
+  }
+}
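
Note on the remainder path above: params->avx.mask_table holds seven
-1 words followed by seven 0 words, and the kernel loads the mask from
byte address &mask_table[7] - n. Since n == k * sizeof(float) for a
remainder of k floats, that address is &mask_table[7 - k], so the
eight loaded lanes are k words of all-ones followed by zeros --
exactly the mask _mm256_maskload_ps needs to read only the first k
elements. A sketch of the address arithmetic (illustrative, not part
of this change):

    // k = n / sizeof(float), with 1 <= k <= 7 in the remainder path.
    static const int32_t mask_table[14] =
      {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
    const int32_t* mask = (const int32_t*) ((uintptr_t) &mask_table[7] - n);
    // mask[0 .. k-1] == -1 (lane loaded), mask[k .. 7] == 0 (lane zeroed).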
diff --git a/src/f32-qs8-vcvt/gen/vcvt-avx-x16.c b/src/f32-qs8-vcvt/gen/vcvt-avx-x16.c
new file mode 100644
index 0000000..eae490f
--- /dev/null
+++ b/src/f32-qs8-vcvt/gen/vcvt-avx-x16.c
@@ -0,0 +1,107 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-qs8-vcvt/avx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qs8_vcvt_ukernel__avx_x16(
+    size_t n,
+    const float* x,
+    int8_t* y,
+    const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    __m256 vx01234567 = _mm256_loadu_ps(x);
+    __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+    x += 16;
+
+    vx01234567 = _mm256_mul_ps(vx01234567, vscale);
+    vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
+
+    vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
+    vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
+
+    const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
+    const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
+
+    __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
+    __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
+
+    vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
+    vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
+
+    __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF);
+
+    vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min);
+
+    _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
+    y += 16;
+  }
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    __m256 vx = _mm256_loadu_ps(x);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+    x += 8;
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packs_epi16(vy, vy);
+    vy = _mm_max_epi8(vy, voutput_min);
+
+    _mm_storel_epi64((__m128i*) y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+    __m256 vx = _mm256_maskload_ps(x, vmask);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packs_epi16(vy, vy);
+    vy = _mm_max_epi8(vy, voutput_min);
+
+    if (n & (4 * sizeof(float))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+      y += 4;
+      vy = _mm_srli_epi64(vy, 32);
+    }
+    if (n & (2 * sizeof(float))) {
+      *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+      y += 2;
+      vy = _mm_srli_epi32(vy, 16);
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = (int8_t) _mm_extract_epi8(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-qs8-vcvt/gen/vcvt-avx-x24.c b/src/f32-qs8-vcvt/gen/vcvt-avx-x24.c
new file mode 100644
index 0000000..f270737
--- /dev/null
+++ b/src/f32-qs8-vcvt/gen/vcvt-avx-x24.c
@@ -0,0 +1,116 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-qs8-vcvt/avx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qs8_vcvt_ukernel__avx_x24(
+    size_t n,
+    const float* x,
+    int8_t* y,
+    const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    __m256 vx01234567 = _mm256_loadu_ps(x);
+    __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+    __m256 vxGHIJKLMN = _mm256_loadu_ps(x + 16);
+    x += 24;
+
+    vx01234567 = _mm256_mul_ps(vx01234567, vscale);
+    vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
+    vxGHIJKLMN = _mm256_mul_ps(vxGHIJKLMN, vscale);
+
+    vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
+    vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
+    vxGHIJKLMN = _mm256_min_ps(vxGHIJKLMN, voutput_max_less_zero_point);
+
+    const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
+    const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
+    const __m256i vaccGHIJKLMN = _mm256_cvtps_epi32(vxGHIJKLMN);
+
+    __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
+    __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
+    __m128i vyGHIJKLMN = _mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extractf128_si256(vaccGHIJKLMN, 1));
+
+    vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
+    vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
+    vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
+
+    __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF);
+    vyGHIJKLMN = _mm_packs_epi16(vyGHIJKLMN, vyGHIJKLMN);
+
+    vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min);
+    vyGHIJKLMN = _mm_max_epi8(vyGHIJKLMN, voutput_min);
+
+    _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
+    _mm_storel_epi64((__m128i*) (y + 16), vyGHIJKLMN);
+    y += 24;
+  }
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    __m256 vx = _mm256_loadu_ps(x);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+    x += 8;
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packs_epi16(vy, vy);
+    vy = _mm_max_epi8(vy, voutput_min);
+
+    _mm_storel_epi64((__m128i*) y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+    __m256 vx = _mm256_maskload_ps(x, vmask);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packs_epi16(vy, vy);
+    vy = _mm_max_epi8(vy, voutput_min);
+
+    if (n & (4 * sizeof(float))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+      y += 4;
+      vy = _mm_srli_epi64(vy, 32);
+    }
+    if (n & (2 * sizeof(float))) {
+      *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+      y += 2;
+      vy = _mm_srli_epi32(vy, 16);
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = (int8_t) _mm_extract_epi8(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-qs8-vcvt/gen/vcvt-avx-x32.c b/src/f32-qs8-vcvt/gen/vcvt-avx-x32.c
new file mode 100644
index 0000000..5ea77e2
--- /dev/null
+++ b/src/f32-qs8-vcvt/gen/vcvt-avx-x32.c
@@ -0,0 +1,122 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-qs8-vcvt/avx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qs8_vcvt_ukernel__avx_x32(
+    size_t n,
+    const float* x,
+    int8_t* y,
+    const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+  for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
+    __m256 vx01234567 = _mm256_loadu_ps(x);
+    __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+    __m256 vxGHIJKLMN = _mm256_loadu_ps(x + 16);
+    __m256 vxOPQRSTUV = _mm256_loadu_ps(x + 24);
+    x += 32;
+
+    vx01234567 = _mm256_mul_ps(vx01234567, vscale);
+    vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
+    vxGHIJKLMN = _mm256_mul_ps(vxGHIJKLMN, vscale);
+    vxOPQRSTUV = _mm256_mul_ps(vxOPQRSTUV, vscale);
+
+    vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
+    vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
+    vxGHIJKLMN = _mm256_min_ps(vxGHIJKLMN, voutput_max_less_zero_point);
+    vxOPQRSTUV = _mm256_min_ps(vxOPQRSTUV, voutput_max_less_zero_point);
+
+    const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
+    const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
+    const __m256i vaccGHIJKLMN = _mm256_cvtps_epi32(vxGHIJKLMN);
+    const __m256i vaccOPQRSTUV = _mm256_cvtps_epi32(vxOPQRSTUV);
+
+    __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
+    __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
+    __m128i vyGHIJKLMN = _mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extractf128_si256(vaccGHIJKLMN, 1));
+    __m128i vyOPQRSTUV = _mm_packs_epi32(_mm256_castsi256_si128(vaccOPQRSTUV), _mm256_extractf128_si256(vaccOPQRSTUV, 1));
+
+    vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
+    vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
+    vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
+    vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point);
+
+    __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF);
+    __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV);
+
+    vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min);
+    vyGHIJKLMNOPQRSTUV = _mm_max_epi8(vyGHIJKLMNOPQRSTUV, voutput_min);
+
+    _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
+    _mm_storeu_si128((__m128i*) (y + 16), vyGHIJKLMNOPQRSTUV);
+    y += 32;
+  }
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    __m256 vx = _mm256_loadu_ps(x);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+    x += 8;
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packs_epi16(vy, vy);
+    vy = _mm_max_epi8(vy, voutput_min);
+
+    _mm_storel_epi64((__m128i*) y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+    __m256 vx = _mm256_maskload_ps(x, vmask);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packs_epi16(vy, vy);
+    vy = _mm_max_epi8(vy, voutput_min);
+
+    if (n & (4 * sizeof(float))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+      y += 4;
+      vy = _mm_srli_epi64(vy, 32);
+    }
+    if (n & (2 * sizeof(float))) {
+      *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+      y += 2;
+      vy = _mm_srli_epi32(vy, 16);
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = (int8_t) _mm_extract_epi8(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-qs8-vcvt/gen/vcvt-avx-x8.c b/src/f32-qs8-vcvt/gen/vcvt-avx-x8.c
new file mode 100644
index 0000000..afa47f5
--- /dev/null
+++ b/src/f32-qs8-vcvt/gen/vcvt-avx-x8.c
@@ -0,0 +1,80 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-qs8-vcvt/avx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qs8_vcvt_ukernel__avx_x8(
+    size_t n,
+    const float* x,
+    int8_t* y,
+    const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    __m256 vx = _mm256_loadu_ps(x);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+    x += 8;
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packs_epi16(vy, vy);
+    vy = _mm_max_epi8(vy, voutput_min);
+
+    _mm_storel_epi64((__m128i*) y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+    __m256 vx = _mm256_maskload_ps(x, vmask);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packs_epi16(vy, vy);
+    vy = _mm_max_epi8(vy, voutput_min);
+
+    if (n & (4 * sizeof(float))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+      y += 4;
+      vy = _mm_srli_epi64(vy, 32);
+    }
+    if (n & (2 * sizeof(float))) {
+      *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+      y += 2;
+      vy = _mm_srli_epi32(vy, 16);
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = (int8_t) _mm_extract_epi8(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-qs8-vcvt/gen/vcvt-sse41-x16.c b/src/f32-qs8-vcvt/gen/vcvt-sse41-x16.c
index 334d49f..1c66297 100644
--- a/src/f32-qs8-vcvt/gen/vcvt-sse41-x16.c
+++ b/src/f32-qs8-vcvt/gen/vcvt-sse41-x16.c
@@ -119,7 +119,7 @@
       vy = _mm_srli_epi32(vy, 16);
     }
     if (n & (1 * sizeof(float))) {
-      *y = (int8_t) _mm_extract_epi16(vy, 0);
+      *y = (int8_t) _mm_extract_epi8(vy, 0);
     }
   }
 }
diff --git a/src/f32-qs8-vcvt/gen/vcvt-sse41-x24.c b/src/f32-qs8-vcvt/gen/vcvt-sse41-x24.c
index 29e2ef6..d2623e8 100644
--- a/src/f32-qs8-vcvt/gen/vcvt-sse41-x24.c
+++ b/src/f32-qs8-vcvt/gen/vcvt-sse41-x24.c
@@ -132,7 +132,7 @@
       vy = _mm_srli_epi32(vy, 16);
     }
     if (n & (1 * sizeof(float))) {
-      *y = (int8_t) _mm_extract_epi16(vy, 0);
+      *y = (int8_t) _mm_extract_epi8(vy, 0);
     }
   }
 }
diff --git a/src/f32-qs8-vcvt/gen/vcvt-sse41-x32.c b/src/f32-qs8-vcvt/gen/vcvt-sse41-x32.c
index 258ba84..ee2c43d 100644
--- a/src/f32-qs8-vcvt/gen/vcvt-sse41-x32.c
+++ b/src/f32-qs8-vcvt/gen/vcvt-sse41-x32.c
@@ -142,7 +142,7 @@
       vy = _mm_srli_epi32(vy, 16);
     }
     if (n & (1 * sizeof(float))) {
-      *y = (int8_t) _mm_extract_epi16(vy, 0);
+      *y = (int8_t) _mm_extract_epi8(vy, 0);
     }
   }
 }
diff --git a/src/f32-qs8-vcvt/gen/vcvt-sse41-x8.c b/src/f32-qs8-vcvt/gen/vcvt-sse41-x8.c
index 097e082..64d5ca6 100644
--- a/src/f32-qs8-vcvt/gen/vcvt-sse41-x8.c
+++ b/src/f32-qs8-vcvt/gen/vcvt-sse41-x8.c
@@ -83,7 +83,7 @@
       vy = _mm_srli_epi32(vy, 16);
     }
     if (n & (1 * sizeof(float))) {
-      *y = (int8_t) _mm_extract_epi16(vy, 0);
+      *y = (int8_t) _mm_extract_epi8(vy, 0);
     }
   }
 }
diff --git a/src/f32-qs8-vcvt/sse.c.in b/src/f32-qs8-vcvt/sse.c.in
index 1f1f9b5..8d60b9f 100644
--- a/src/f32-qs8-vcvt/sse.c.in
+++ b/src/f32-qs8-vcvt/sse.c.in
@@ -143,7 +143,7 @@
         vy = _mm_srli_epi32(vy, 16);
       }
       if (n & (1 * sizeof(float))) {
-        *y = (${XINT8_T}) _mm_extract_epi16(vy, 0);
+        *y = (${XINT8_T}) _mm_extract_epi8(vy, 0);
       }
     $else:
       {
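
The _mm_extract_epi16 -> _mm_extract_epi8 substitutions above are
behavior-preserving: both intrinsics zero-extend the selected lane, so
after the cast to an 8-bit type the same low byte is stored either
way. The epi8 form (used only on the SSE4.1 code path; the template's
$else branch still handles SSE2) simply states the intent directly:

    // Equivalent single-byte stores of lane 0's low byte:
    *y = (int8_t) _mm_extract_epi16(vy, 0);  // low byte of 16-bit lane 0
    *y = (int8_t) _mm_extract_epi8(vy, 0);   // 8-bit lane 0 (SSE4.1)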
diff --git a/src/f32-qu8-vcvt/gen/vcvt-avx-x16.c b/src/f32-qu8-vcvt/gen/vcvt-avx-x16.c
new file mode 100644
index 0000000..6fd36b8
--- /dev/null
+++ b/src/f32-qu8-vcvt/gen/vcvt-avx-x16.c
@@ -0,0 +1,107 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-qs8-vcvt/avx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qu8_vcvt_ukernel__avx_x16(
+    size_t n,
+    const float* x,
+    uint8_t* y,
+    const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    __m256 vx01234567 = _mm256_loadu_ps(x);
+    __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+    x += 16;
+
+    vx01234567 = _mm256_mul_ps(vx01234567, vscale);
+    vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
+
+    vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
+    vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
+
+    const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
+    const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
+
+    __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
+    __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
+
+    vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
+    vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
+
+    __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF);
+
+    vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min);
+
+    _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
+    y += 16;
+  }
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    __m256 vx = _mm256_loadu_ps(x);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+    x += 8;
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packus_epi16(vy, vy);
+    vy = _mm_max_epu8(vy, voutput_min);
+
+    _mm_storel_epi64((__m128i*) y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+    __m256 vx = _mm256_maskload_ps(x, vmask);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packus_epi16(vy, vy);
+    vy = _mm_max_epu8(vy, voutput_min);
+
+    if (n & (4 * sizeof(float))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+      y += 4;
+      vy = _mm_srli_epi64(vy, 32);
+    }
+    if (n & (2 * sizeof(float))) {
+      *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+      y += 2;
+      vy = _mm_srli_epi32(vy, 16);
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = (uint8_t) _mm_extract_epi8(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-qu8-vcvt/gen/vcvt-avx-x24.c b/src/f32-qu8-vcvt/gen/vcvt-avx-x24.c
new file mode 100644
index 0000000..4790d23
--- /dev/null
+++ b/src/f32-qu8-vcvt/gen/vcvt-avx-x24.c
@@ -0,0 +1,116 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-qs8-vcvt/avx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qu8_vcvt_ukernel__avx_x24(
+    size_t n,
+    const float* x,
+    uint8_t* y,
+    const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    __m256 vx01234567 = _mm256_loadu_ps(x);
+    __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+    __m256 vxGHIJKLMN = _mm256_loadu_ps(x + 16);
+    x += 24;
+
+    vx01234567 = _mm256_mul_ps(vx01234567, vscale);
+    vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
+    vxGHIJKLMN = _mm256_mul_ps(vxGHIJKLMN, vscale);
+
+    vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
+    vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
+    vxGHIJKLMN = _mm256_min_ps(vxGHIJKLMN, voutput_max_less_zero_point);
+
+    const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
+    const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
+    const __m256i vaccGHIJKLMN = _mm256_cvtps_epi32(vxGHIJKLMN);
+
+    __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
+    __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
+    __m128i vyGHIJKLMN = _mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extractf128_si256(vaccGHIJKLMN, 1));
+
+    vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
+    vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
+    vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
+
+    __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF);
+    vyGHIJKLMN = _mm_packus_epi16(vyGHIJKLMN, vyGHIJKLMN);
+
+    vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min);
+    vyGHIJKLMN = _mm_max_epu8(vyGHIJKLMN, voutput_min);
+
+    _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
+    _mm_storel_epi64((__m128i*) (y + 16), vyGHIJKLMN);
+    y += 24;
+  }
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    __m256 vx = _mm256_loadu_ps(x);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+    x += 8;
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packus_epi16(vy, vy);
+    vy = _mm_max_epu8(vy, voutput_min);
+
+    _mm_storel_epi64((__m128i*) y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+    __m256 vx = _mm256_maskload_ps(x, vmask);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packus_epi16(vy, vy);
+    vy = _mm_max_epu8(vy, voutput_min);
+
+    if (n & (4 * sizeof(float))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+      y += 4;
+      vy = _mm_srli_epi64(vy, 32);
+    }
+    if (n & (2 * sizeof(float))) {
+      *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+      y += 2;
+      vy = _mm_srli_epi32(vy, 16);
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = (uint8_t) _mm_extract_epi8(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-qu8-vcvt/gen/vcvt-avx-x32.c b/src/f32-qu8-vcvt/gen/vcvt-avx-x32.c
new file mode 100644
index 0000000..547bd75
--- /dev/null
+++ b/src/f32-qu8-vcvt/gen/vcvt-avx-x32.c
@@ -0,0 +1,122 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-qs8-vcvt/avx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qu8_vcvt_ukernel__avx_x32(
+    size_t n,
+    const float* x,
+    uint8_t* y,
+    const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+  for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
+    __m256 vx01234567 = _mm256_loadu_ps(x);
+    __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+    __m256 vxGHIJKLMN = _mm256_loadu_ps(x + 16);
+    __m256 vxOPQRSTUV = _mm256_loadu_ps(x + 24);
+    x += 32;
+
+    vx01234567 = _mm256_mul_ps(vx01234567, vscale);
+    vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
+    vxGHIJKLMN = _mm256_mul_ps(vxGHIJKLMN, vscale);
+    vxOPQRSTUV = _mm256_mul_ps(vxOPQRSTUV, vscale);
+
+    vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
+    vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
+    vxGHIJKLMN = _mm256_min_ps(vxGHIJKLMN, voutput_max_less_zero_point);
+    vxOPQRSTUV = _mm256_min_ps(vxOPQRSTUV, voutput_max_less_zero_point);
+
+    const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
+    const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
+    const __m256i vaccGHIJKLMN = _mm256_cvtps_epi32(vxGHIJKLMN);
+    const __m256i vaccOPQRSTUV = _mm256_cvtps_epi32(vxOPQRSTUV);
+
+    __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
+    __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
+    __m128i vyGHIJKLMN = _mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extractf128_si256(vaccGHIJKLMN, 1));
+    __m128i vyOPQRSTUV = _mm_packs_epi32(_mm256_castsi256_si128(vaccOPQRSTUV), _mm256_extractf128_si256(vaccOPQRSTUV, 1));
+
+    vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
+    vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
+    vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
+    vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point);
+
+    __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF);
+    __m128i vyGHIJKLMNOPQRSTUV = _mm_packus_epi16(vyGHIJKLMN, vyOPQRSTUV);
+
+    vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min);
+    vyGHIJKLMNOPQRSTUV = _mm_max_epu8(vyGHIJKLMNOPQRSTUV, voutput_min);
+
+    _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
+    _mm_storeu_si128((__m128i*) (y + 16), vyGHIJKLMNOPQRSTUV);
+    y += 32;
+  }
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    __m256 vx = _mm256_loadu_ps(x);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+    x += 8;
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packus_epi16(vy, vy);
+    vy = _mm_max_epu8(vy, voutput_min);
+
+    _mm_storel_epi64((__m128i*) y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+    __m256 vx = _mm256_maskload_ps(x, vmask);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packus_epi16(vy, vy);
+    vy = _mm_max_epu8(vy, voutput_min);
+
+    if (n & (4 * sizeof(float))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+      y += 4;
+      vy = _mm_srli_epi64(vy, 32);
+    }
+    if (n & (2 * sizeof(float))) {
+      *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+      y += 2;
+      vy = _mm_srli_epi32(vy, 16);
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = (uint8_t) _mm_extract_epi8(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-qu8-vcvt/gen/vcvt-avx-x8.c b/src/f32-qu8-vcvt/gen/vcvt-avx-x8.c
new file mode 100644
index 0000000..a75ede7
--- /dev/null
+++ b/src/f32-qu8-vcvt/gen/vcvt-avx-x8.c
@@ -0,0 +1,80 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-qs8-vcvt/avx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_f32_qu8_vcvt_ukernel__avx_x8(
+    size_t n,
+    const float* x,
+    uint8_t* y,
+    const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
+  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    __m256 vx = _mm256_loadu_ps(x);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+    x += 8;
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packus_epi16(vy, vy);
+    vy = _mm_max_epu8(vy, voutput_min);
+
+    _mm_storel_epi64((__m128i*) y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
+
+    __m256 vx = _mm256_maskload_ps(x, vmask);
+    vx = _mm256_mul_ps(vx, vscale);
+    vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
+
+    const __m256i vacc = _mm256_cvtps_epi32(vx);
+
+    __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
+    vy = _mm_adds_epi16(vy, voutput_zero_point);
+    vy = _mm_packus_epi16(vy, vy);
+    vy = _mm_max_epu8(vy, voutput_min);
+
+    if (n & (4 * sizeof(float))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+      y += 4;
+      vy = _mm_srli_epi64(vy, 32);
+    }
+    if (n & (2 * sizeof(float))) {
+      *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+      y += 2;
+      vy = _mm_srli_epi32(vy, 16);
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = (uint8_t) _mm_extract_epi8(vy, 0);
+    }
+  }
+}
diff --git a/src/init.c b/src/init.c
index f99ad26..89ffd3f 100644
--- a/src/init.c
+++ b/src/init.c
@@ -3517,7 +3517,13 @@
       xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32;
       xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse2_x16;
     }
-    if (cpuinfo_has_x86_sse4_1()) {
+    if (cpuinfo_has_x86_avx()) {
+      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
+        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx_x32,
+        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx_params,
+        .element_tile = 32,
+      };
+    } else if (cpuinfo_has_x86_sse4_1()) {
       xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse41_x32,
         .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse4_params,
@@ -3530,11 +3536,19 @@
         .element_tile = 32,
       };
     }
-    xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
-      .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
-      .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
-      .element_tile = 32,
-    };
+    if (cpuinfo_has_x86_avx()) {
+      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
+        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx_x32,
+        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx_params,
+        .element_tile = 32,
+      };
+    } else {
+      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
+        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
+        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
+        .element_tile = 32,
+      };
+    }
     if (cpuinfo_has_x86_sse4_1()) {
       xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
         .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse41_x16,
diff --git a/src/params-init.c b/src/params-init.c
index 9c77f97..83d0f25 100644
--- a/src/params-init.c
+++ b/src/params-init.c
@@ -2831,6 +2831,32 @@
     params->sse4.output_min[i] = output_min;
   }
 }
+
+XNN_INTERNAL void xnn_init_f32_qs8_cvt_avx_params(
+  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
+  float scale,
+  int8_t output_zero_point,
+  int8_t output_min,
+  int8_t output_max)
+{
+  const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
+  for (uint32_t i = 0; i < 8; i++) {
+    params->avx.scale[i] = scale;
+    params->avx.output_max_less_zero_point[i] = output_max_less_zero_point;
+  }
+  for (uint32_t i = 0; i < 8; i++) {
+    params->avx.output_zero_point[i] = (int16_t) output_zero_point;
+  }
+  for (uint32_t i = 0; i < 16; i++) {
+    params->avx.output_min[i] = output_min;
+  }
+  for (uint32_t i = 0; i < 7; i++) {
+    params->avx.mask_table[i] = -1;
+  }
+  for (uint32_t i = 7; i < 14; i++) {
+    params->avx.mask_table[i] = 0;
+  }
+}
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 #if XNN_ARCH_WASMSIMD
@@ -2954,6 +2980,32 @@
     params->sse2.output_min[i] = output_min;
   }
 }
+
+XNN_INTERNAL void xnn_init_f32_qu8_cvt_avx_params(
+  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
+  float scale,
+  uint8_t output_zero_point,
+  uint8_t output_min,
+  uint8_t output_max)
+{
+  const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
+  for (uint32_t i = 0; i < 8; i++) {
+    params->avx.scale[i] = scale;
+    params->avx.output_max_less_zero_point[i] = output_max_less_zero_point;
+  }
+  for (uint32_t i = 0; i < 8; i++) {
+    params->avx.output_zero_point[i] = (int16_t) output_zero_point;
+  }
+  for (uint32_t i = 0; i < 16; i++) {
+    params->avx.output_min[i] = output_min;
+  }
+  for (uint32_t i = 0; i < 7; i++) {
+    params->avx.mask_table[i] = -1;
+  }
+  for (uint32_t i = 7; i < 14; i++) {
+    params->avx.mask_table[i] = 0;
+  }
+}
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 #if XNN_ARCH_WASMSIMD
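
For reference, a hypothetical call site pairing the new initializer
with the x32 kernel that init.c now selects (the wrapper function name
and the quantization values below are invented for illustration; only
the xnn_* symbols are from this change):

    #include <stddef.h>
    #include <stdint.h>

    #include <xnnpack/params-init.h>
    #include <xnnpack/vcvt.h>

    void convert_f32_to_qs8(const float* input, int8_t* output, size_t batch) {
      union xnn_f32_qs8_cvt_params params;
      xnn_init_f32_qs8_cvt_avx_params(&params, /*scale=*/ 256.0f,
        /*output_zero_point=*/ 1, /*output_min=*/ INT8_MIN,
        /*output_max=*/ INT8_MAX);
      // The first ukernel argument is the byte size of the float input.
      xnn_f32_qs8_vcvt_ukernel__avx_x32(
        batch * sizeof(float), input, output, &params);
    }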
diff --git a/src/xnnpack/params-init.h b/src/xnnpack/params-init.h
index bc05495..99d6b80 100644
--- a/src/xnnpack/params-init.h
+++ b/src/xnnpack/params-init.h
@@ -807,6 +807,13 @@
   int8_t output_zero_point,
   int8_t output_min,
   int8_t output_max);
+
+XNN_INTERNAL void xnn_init_f32_qs8_cvt_avx_params(
+  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
+  float scale,
+  int8_t output_zero_point,
+  int8_t output_min,
+  int8_t output_max);
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 #if XNN_ARCH_WASMSIMD
@@ -862,6 +869,13 @@
   uint8_t output_zero_point,
   uint8_t output_min,
   uint8_t output_max);
+
+XNN_INTERNAL void xnn_init_f32_qu8_cvt_avx_params(
+  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
+  float scale,
+  uint8_t output_zero_point,
+  uint8_t output_min,
+  uint8_t output_max);
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 #if XNN_ARCH_WASMSIMD
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index c8f22bc..55296d3 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -896,6 +896,13 @@
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) int8_t output_min[16];
   } sse4;
+  struct {
+    XNN_ALIGN(32) float scale[8];
+    XNN_ALIGN(32) float output_max_less_zero_point[8];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int8_t output_min[16];
+    int32_t mask_table[14];
+  } avx;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD
   struct {
@@ -951,6 +958,13 @@
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) uint8_t output_min[16];
   } sse2;
+  struct {
+    XNN_ALIGN(32) float scale[8];
+    XNN_ALIGN(32) float output_max_less_zero_point[8];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) uint8_t output_min[16];
+    int32_t mask_table[14];
+  } avx;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD
   struct {
diff --git a/src/xnnpack/vcvt.h b/src/xnnpack/vcvt.h
index 4df995b..3422d43 100644
--- a/src/xnnpack/vcvt.h
+++ b/src/xnnpack/vcvt.h
@@ -285,6 +285,11 @@
 DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__sse41_x24)
 DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__sse41_x32)
 
+DECLARE_F32_QS8_VCVT_UKERNEL_FUNCTION(xnn_f32_qs8_vcvt_ukernel__avx_x8)
+DECLARE_F32_QS8_VCVT_UKERNEL_FUNCTION(xnn_f32_qs8_vcvt_ukernel__avx_x16)
+DECLARE_F32_QS8_VCVT_UKERNEL_FUNCTION(xnn_f32_qs8_vcvt_ukernel__avx_x24)
+DECLARE_F32_QS8_VCVT_UKERNEL_FUNCTION(xnn_f32_qs8_vcvt_ukernel__avx_x32)
+
 DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__wasmsimd_x8)
 DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__wasmsimd_x16)
 DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__wasmsimd_x24)
@@ -318,6 +323,11 @@
 DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__sse41_x24)
 DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__sse41_x32)
 
+DECLARE_F32_QU8_VCVT_UKERNEL_FUNCTION(xnn_f32_qu8_vcvt_ukernel__avx_x8)
+DECLARE_F32_QU8_VCVT_UKERNEL_FUNCTION(xnn_f32_qu8_vcvt_ukernel__avx_x16)
+DECLARE_F32_QU8_VCVT_UKERNEL_FUNCTION(xnn_f32_qu8_vcvt_ukernel__avx_x24)
+DECLARE_F32_QU8_VCVT_UKERNEL_FUNCTION(xnn_f32_qu8_vcvt_ukernel__avx_x32)
+
 DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__wasmsimd_x8)
 DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__wasmsimd_x16)
 DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__wasmsimd_x24)
diff --git a/test/f32-qs8-vcvt.cc b/test/f32-qs8-vcvt.cc
index 458bf7a..b8de37f 100644
--- a/test/f32-qs8-vcvt.cc
+++ b/test/f32-qs8-vcvt.cc
@@ -1985,6 +1985,498 @@
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_QS8_VCVT__AVX_X8, batch_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    VCvtMicrokernelTester()
+      .batch_size(8)
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X8, batch_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X8, batch_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X8, batch_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X8, scale) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X8, zero_point) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X8, saturation) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(500)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X8, overflow) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(4294967296.0f)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X8, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmin = -128; qmin < 127; qmin += 51) {
+      for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .qmin(qmin)
+          .qmax(std::numeric_limits<int8_t>::max())
+          .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X8, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmax = -127; qmax <= 127; qmax += 51) {
+      for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(qmax)
+          .Test(xnn_f32_qs8_vcvt_ukernel__avx_x8, xnn_init_f32_qs8_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_QS8_VCVT__AVX_X16, batch_eq_16) {
+    TEST_REQUIRES_X86_AVX;
+    VCvtMicrokernelTester()
+      .batch_size(16)
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X16, batch_div_16) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X16, batch_lt_16) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X16, batch_gt_16) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X16, scale) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X16, zero_point) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X16, saturation) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(500)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X16, overflow) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(4294967296.0f)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X16, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmin = -128; qmin < 127; qmin += 51) {
+      for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .qmin(qmin)
+          .qmax(std::numeric_limits<int8_t>::max())
+          .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X16, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmax = -127; qmax <= 127; qmax += 51) {
+      for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(qmax)
+          .Test(xnn_f32_qs8_vcvt_ukernel__avx_x16, xnn_init_f32_qs8_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_QS8_VCVT__AVX_X24, batch_eq_24) {
+    TEST_REQUIRES_X86_AVX;
+    VCvtMicrokernelTester()
+      .batch_size(24)
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X24, batch_div_24) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 48; batch_size < 240; batch_size += 24) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X24, batch_lt_24) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size < 24; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X24, batch_gt_24) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 25; batch_size < 48; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X24, scale) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X24, zero_point) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X24, saturation) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(500)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X24, overflow) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(4294967296.0f)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X24, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmin = -128; qmin < 127; qmin += 51) {
+      for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .qmin(qmin)
+          .qmax(std::numeric_limits<int8_t>::max())
+          .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X24, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmax = -127; qmax <= 127; qmax += 51) {
+      for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(qmax)
+          .Test(xnn_f32_qs8_vcvt_ukernel__avx_x24, xnn_init_f32_qs8_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_QS8_VCVT__AVX_X32, batch_eq_32) {
+    TEST_REQUIRES_X86_AVX;
+    VCvtMicrokernelTester()
+      .batch_size(32)
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X32, batch_div_32) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X32, batch_lt_32) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X32, batch_gt_32) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X32, scale) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X32, zero_point) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X32, saturation) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(500)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X32, overflow) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(4294967296.0f)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X32, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmin = -128; qmin < 127; qmin += 51) {
+      for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .qmin(qmin)
+          .qmax(std::numeric_limits<int8_t>::max())
+          .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QS8_VCVT__AVX_X32, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmax = -127; qmax <= 127; qmax += 51) {
+      for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(qmax)
+          .Test(xnn_f32_qs8_vcvt_ukernel__avx_x32, xnn_init_f32_qs8_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
 #if XNN_ARCH_WASMSIMD
   TEST(F32_QS8_VCVT__WASMSIMD_CVT_X8, batch_eq_8) {
     VCvtMicrokernelTester()
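
The batch_lt/batch_gt cases above exercise the remainder path for batch sizes that are not a multiple of the kernel's unroll. The generated AVX kernels keep the tail vectorized; the scalar loop below is only a hedged reference for what any correct tail must compute per element (a hypothetical helper, not XNNPACK code):

#include <math.h>
#include <stdint.h>

/* Hypothetical scalar reference for one remainder element; lrintf rounds
   to nearest-even under the default FP environment, matching CVTPS2DQ. */
static int8_t f32_qs8_cvt_one_sketch(float x, float scale, int16_t zero_point,
                                     int8_t qmin, int8_t qmax) {
  float vx = x * scale;
  const float vmin = (float) ((int16_t) qmin - zero_point);
  const float vmax = (float) ((int16_t) qmax - zero_point);
  vx = vx < vmin ? vmin : vx;  /* clamp before rounding keeps lrintf in range */
  vx = vx > vmax ? vmax : vx;
  return (int8_t) (lrintf(vx) + zero_point);
}
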
diff --git a/test/f32-qs8-vcvt.yaml b/test/f32-qs8-vcvt.yaml
index 674a79d..3937810 100644
--- a/test/f32-qs8-vcvt.yaml
+++ b/test/f32-qs8-vcvt.yaml
@@ -35,6 +35,14 @@
   init: xnn_init_f32_qs8_cvt_sse4_params
 - name: xnn_f32_qs8_vcvt_ukernel__sse41_x32
   init: xnn_init_f32_qs8_cvt_sse4_params
+- name: xnn_f32_qs8_vcvt_ukernel__avx_x8
+  init: xnn_init_f32_qs8_cvt_avx_params
+- name: xnn_f32_qs8_vcvt_ukernel__avx_x16
+  init: xnn_init_f32_qs8_cvt_avx_params
+- name: xnn_f32_qs8_vcvt_ukernel__avx_x24
+  init: xnn_init_f32_qs8_cvt_avx_params
+- name: xnn_f32_qs8_vcvt_ukernel__avx_x32
+  init: xnn_init_f32_qs8_cvt_avx_params
 - name: xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_x8
   init: xnn_init_f32_qs8_cvt_wasmsimd_cvt_params
 - name: xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_x16
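
Each yaml entry pairs a ukernel with the params-init routine that the generated tests above pass to VCvtMicrokernelTester::Test. A hedged sketch of the kind of precomputation such an AVX initializer plausibly performs follows; the struct name, fields, and layout are invented for illustration and are not the actual union xnn_f32_qs8_cvt_params:

#include <stdint.h>

/* Illustrative params block; the real union uses aligned, lane-replicated
   arrays so the kernel can load constants with a single _mm256_load_ps. */
struct f32_qs8_cvt_avx_params_sketch {
  float scale[8];                       /* replicated for one __m256 load */
  float output_max_less_zero_point[8];  /* float-domain upper clamp */
  int16_t output_zero_point[8];         /* added after int32->int16 pack */
  int8_t output_min[16];                /* final lower clamp */
};

static void init_f32_qs8_cvt_avx_params_sketch(
    struct f32_qs8_cvt_avx_params_sketch* p,
    float scale, int16_t zero_point, int8_t qmin, int8_t qmax) {
  const float out_max = (float) ((int16_t) qmax - zero_point);
  for (int i = 0; i < 8; i++) {
    p->scale[i] = scale;
    p->output_max_less_zero_point[i] = out_max;
    p->output_zero_point[i] = zero_point;
  }
  for (int i = 0; i < 16; i++) {
    p->output_min[i] = qmin;
  }
}
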
diff --git a/test/f32-qu8-vcvt.cc b/test/f32-qu8-vcvt.cc
index 7d6e41f..b604d30 100644
--- a/test/f32-qu8-vcvt.cc
+++ b/test/f32-qu8-vcvt.cc
@@ -1541,6 +1541,514 @@
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_QU8_VCVT__AVX_X8, batch_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    VCvtMicrokernelTester()
+      .batch_size(8)
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
+      .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X8, batch_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X8, batch_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X8, batch_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X8, scale) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .zero_point(100)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X8, zero_point) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
+          .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X8, saturation) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(500)
+        .zero_point(128)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X8, overflow) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(4294967296.0f)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X8, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmin = 0; qmin < 255; qmin += 51) {
+      for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .zero_point(128)
+          .qmin(qmin)
+          .qmax(std::numeric_limits<uint8_t>::max())
+          .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X8, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmax = 1; qmax <= 255; qmax += 51) {
+      for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .zero_point(128)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(qmax)
+          .Test(xnn_f32_qu8_vcvt_ukernel__avx_x8, xnn_init_f32_qu8_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_QU8_VCVT__AVX_X16, batch_eq_16) {
+    TEST_REQUIRES_X86_AVX;
+    VCvtMicrokernelTester()
+      .batch_size(16)
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
+      .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X16, batch_div_16) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X16, batch_lt_16) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X16, batch_gt_16) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X16, scale) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .zero_point(100)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X16, zero_point) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
+          .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X16, saturation) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(500)
+        .zero_point(128)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X16, overflow) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(4294967296.0f)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X16, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmin = 0; qmin < 255; qmin += 51) {
+      for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .zero_point(128)
+          .qmin(qmin)
+          .qmax(std::numeric_limits<uint8_t>::max())
+          .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X16, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmax = 1; qmax <= 255; qmax += 51) {
+      for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .zero_point(128)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(qmax)
+          .Test(xnn_f32_qu8_vcvt_ukernel__avx_x16, xnn_init_f32_qu8_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_QU8_VCVT__AVX_X24, batch_eq_24) {
+    TEST_REQUIRES_X86_AVX;
+    VCvtMicrokernelTester()
+      .batch_size(24)
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
+      .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X24, batch_div_24) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 48; batch_size < 240; batch_size += 24) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X24, batch_lt_24) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size < 24; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X24, batch_gt_24) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 25; batch_size < 48; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X24, scale) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .zero_point(100)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X24, zero_point) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
+          .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X24, saturation) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(500)
+        .zero_point(128)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X24, overflow) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(4294967296.0f)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X24, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmin = 0; qmin < 255; qmin += 51) {
+      for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .zero_point(128)
+          .qmin(qmin)
+          .qmax(std::numeric_limits<uint8_t>::max())
+          .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X24, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmax = 1; qmax <= 255; qmax += 51) {
+      for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .zero_point(128)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(qmax)
+          .Test(xnn_f32_qu8_vcvt_ukernel__avx_x24, xnn_init_f32_qu8_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_QU8_VCVT__AVX_X32, batch_eq_32) {
+    TEST_REQUIRES_X86_AVX;
+    VCvtMicrokernelTester()
+      .batch_size(32)
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
+      .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X32, batch_div_32) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X32, batch_lt_32) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X32, batch_gt_32) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X32, scale) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .zero_point(100)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X32, zero_point) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
+          .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X32, saturation) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(500)
+        .zero_point(128)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X32, overflow) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(4294967296.0f)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
+        .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X32, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmin = 0; qmin < 255; qmin += 51) {
+      for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .zero_point(128)
+          .qmin(qmin)
+          .qmax(std::numeric_limits<uint8_t>::max())
+          .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+      }
+    }
+  }
+
+  TEST(F32_QU8_VCVT__AVX_X32, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (int16_t qmax = 1; qmax <= 255; qmax += 51) {
+      for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .scale(500)
+          .zero_point(128)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(qmax)
+          .Test(xnn_f32_qu8_vcvt_ukernel__avx_x32, xnn_init_f32_qu8_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
 #if XNN_ARCH_WASMSIMD
   TEST(F32_QU8_VCVT__WASMSIMD_CVT_X8, batch_eq_8) {
     VCvtMicrokernelTester()
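
The QU8 tests above mirror the QS8 ones with unsigned limits and explicit zero points of 100/128. In kernel terms, the pipeline plausibly differs only in the final packing stage, which saturates to unsigned 8-bit instead of signed; a hedged fragment with the same caveats as the QS8 sketch earlier:

#include <immintrin.h>

/* Hypothetical final stage for QU8: identical to QS8 up to the int16
   lanes holding value+zero_point in vy, then unsigned saturating pack. */
static __m128i qu8_pack_sketch(__m128i vy /* 8 x int16 */, __m128i voutput_min) {
  vy = _mm_packus_epi16(vy, vy);       /* int16 -> uint8, unsigned saturation */
  vy = _mm_max_epu8(vy, voutput_min);  /* clamp to qmin (SSE2) */
  return vy;
}
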
diff --git a/test/f32-qu8-vcvt.yaml b/test/f32-qu8-vcvt.yaml
index e00c709..f31df03 100644
--- a/test/f32-qu8-vcvt.yaml
+++ b/test/f32-qu8-vcvt.yaml
@@ -27,6 +27,14 @@
   init: xnn_init_f32_qu8_cvt_sse2_params
 - name: xnn_f32_qu8_vcvt_ukernel__sse2_x32
   init: xnn_init_f32_qu8_cvt_sse2_params
+- name: xnn_f32_qu8_vcvt_ukernel__avx_x8
+  init: xnn_init_f32_qu8_cvt_avx_params
+- name: xnn_f32_qu8_vcvt_ukernel__avx_x16
+  init: xnn_init_f32_qu8_cvt_avx_params
+- name: xnn_f32_qu8_vcvt_ukernel__avx_x24
+  init: xnn_init_f32_qu8_cvt_avx_params
+- name: xnn_f32_qu8_vcvt_ukernel__avx_x32
+  init: xnn_init_f32_qu8_cvt_avx_params
 - name: xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_x8
   init: xnn_init_f32_qu8_cvt_wasmsimd_cvt_params
 - name: xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_x16