AVX2 QS8->F32 and QU8->F32 VCVT microkernels
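
These kernels dequantize 8-bit inputs to F32. Per element the result is
equivalent to the scalar sketch below (the QU8 variant is identical with a
uint8_t input); this is an illustrative reference only, not code added by
this change:

    void qs8_f32_cvt_ref(size_t n, const int8_t* x, float* y,
                         float scale, int8_t zero_point) {
      for (size_t i = 0; i < n; i++) {
        // Widen to 32 bits, subtract the zero point, then scale.
        y[i] = (float) ((int32_t) x[i] - (int32_t) zero_point) * scale;
      }
    }

Each group of 8 elements is loaded with _mm_loadl_epi64, widened to 32-bit
lanes with _mm256_cvtepi8_epi32 (QS8) or _mm256_cvtepu8_epi32 (QU8), offset
by the pre-negated zero point, converted with _mm256_cvtepi32_ps, and scaled
with _mm256_mul_ps. The shared AVX conversion params now replicate
minus_zero_point across 8 lanes so it can be loaded directly as a __m256i,
and AVX2-capable CPUs dispatch to the new x16 kernels instead of the AVX x32
kernels.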

PiperOrigin-RevId: 416487347
diff --git a/BUILD.bazel b/BUILD.bazel
index d2a62cb..5092e73 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -5649,6 +5649,7 @@
     "src/qc8-igemm/gen/3x8c8-minmax-fp32-avx2.c",
     "src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
+    "src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c",
     "src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
     "src/qs8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
     "src/qs8-igemm/gen/1x8c8-minmax-fp32-avx2.c",
@@ -5657,6 +5658,7 @@
     "src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c",
     "src/qu8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c",
     "src/qu8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
+    "src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c",
     "src/qu8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
     "src/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
     "src/qu8-igemm/gen/1x8c8-minmax-fp32-avx2.c",
@@ -5877,6 +5879,10 @@
     "src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpmovsx.c",
     "src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpunpck.c",
     "src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul32.c",
+    "src/qs8-f32-vcvt/gen/vcvt-avx2-x8.c",
+    "src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c",
+    "src/qs8-f32-vcvt/gen/vcvt-avx2-x24.c",
+    "src/qs8-f32-vcvt/gen/vcvt-avx2-x32.c",
     "src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
     "src/qs8-gemm/gen/1x8c8-xw-minmax-fp32-avx2.c",
     "src/qs8-gemm/gen/2x8c8-minmax-fp32-avx2.c",
@@ -5900,6 +5906,10 @@
     "src/qu8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
     "src/qu8-dwconv/gen/up32x9-minmax-fp32-avx2-mul32.c",
     "src/qu8-dwconv/gen/up32x25-minmax-fp32-avx2-mul32.c",
+    "src/qu8-f32-vcvt/gen/vcvt-avx2-x8.c",
+    "src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c",
+    "src/qu8-f32-vcvt/gen/vcvt-avx2-x24.c",
+    "src/qu8-f32-vcvt/gen/vcvt-avx2-x32.c",
     "src/qu8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
     "src/qu8-gemm/gen/2x8c8-minmax-fp32-avx2.c",
     "src/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3cc6206..42ca9c5 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4577,6 +4577,7 @@
   src/qc8-igemm/gen/3x8c8-minmax-fp32-avx2.c
   src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c
   src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c
+  src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c
   src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c
   src/qs8-gemm/gen/3x8c8-minmax-fp32-avx2.c
   src/qs8-igemm/gen/1x8c8-minmax-fp32-avx2.c
@@ -4585,6 +4586,7 @@
   src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c
   src/qu8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c
   src/qu8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c
+  src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c
   src/qu8-gemm/gen/1x8c8-minmax-fp32-avx2.c
   src/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c
   src/qu8-igemm/gen/1x8c8-minmax-fp32-avx2.c
@@ -4806,6 +4808,10 @@
   src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpmovsx.c
   src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpunpck.c
   src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul32.c
+  src/qs8-f32-vcvt/gen/vcvt-avx2-x8.c
+  src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c
+  src/qs8-f32-vcvt/gen/vcvt-avx2-x24.c
+  src/qs8-f32-vcvt/gen/vcvt-avx2-x32.c
   src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c
   src/qs8-gemm/gen/1x8c8-xw-minmax-fp32-avx2.c
   src/qs8-gemm/gen/2x8c8-minmax-fp32-avx2.c
@@ -4829,6 +4835,10 @@
   src/qu8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c
   src/qu8-dwconv/gen/up32x9-minmax-fp32-avx2-mul32.c
   src/qu8-dwconv/gen/up32x25-minmax-fp32-avx2-mul32.c
+  src/qu8-f32-vcvt/gen/vcvt-avx2-x8.c
+  src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c
+  src/qu8-f32-vcvt/gen/vcvt-avx2-x24.c
+  src/qu8-f32-vcvt/gen/vcvt-avx2-x32.c
   src/qu8-gemm/gen/1x8c8-minmax-fp32-avx2.c
   src/qu8-gemm/gen/2x8c8-minmax-fp32-avx2.c
   src/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c
diff --git a/bench/qs8-f32-vcvt.cc b/bench/qs8-f32-vcvt.cc
index 6830731..bb8ad27 100644
--- a/bench/qs8-f32-vcvt.cc
+++ b/bench/qs8-f32-vcvt.cc
@@ -93,6 +93,31 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  BENCHMARK_CAPTURE(qs8_f32_vcvt, avx2_x8,
+                    xnn_qs8_f32_vcvt_ukernel__avx2_x8,
+                    xnn_init_qs8_f32_cvt_avx_params,
+                    benchmark::utils::CheckAVX2)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qs8_f32_vcvt, avx2_x16,
+                    xnn_qs8_f32_vcvt_ukernel__avx2_x16,
+                    xnn_init_qs8_f32_cvt_avx_params,
+                    benchmark::utils::CheckAVX2)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qs8_f32_vcvt, avx2_x24,
+                    xnn_qs8_f32_vcvt_ukernel__avx2_x24,
+                    xnn_init_qs8_f32_cvt_avx_params,
+                    benchmark::utils::CheckAVX2)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qs8_f32_vcvt, avx2_x32,
+                    xnn_qs8_f32_vcvt_ukernel__avx2_x32,
+                    xnn_init_qs8_f32_cvt_avx_params,
+                    benchmark::utils::CheckAVX2)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+    ->UseRealTime();
+
   BENCHMARK_CAPTURE(qs8_f32_vcvt, avx_x8,
                     xnn_qs8_f32_vcvt_ukernel__avx_x8,
                     xnn_init_qs8_f32_cvt_avx_params,
diff --git a/bench/qu8-f32-vcvt.cc b/bench/qu8-f32-vcvt.cc
index 43e62bc..9400471 100644
--- a/bench/qu8-f32-vcvt.cc
+++ b/bench/qu8-f32-vcvt.cc
@@ -93,6 +93,31 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  BENCHMARK_CAPTURE(qu8_f32_vcvt, avx2_x8,
+                    xnn_qu8_f32_vcvt_ukernel__avx2_x8,
+                    xnn_init_qu8_f32_cvt_avx_params,
+                    benchmark::utils::CheckAVX2)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_f32_vcvt, avx2_x16,
+                    xnn_qu8_f32_vcvt_ukernel__avx2_x16,
+                    xnn_init_qu8_f32_cvt_avx_params,
+                    benchmark::utils::CheckAVX2)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_f32_vcvt, avx2_x24,
+                    xnn_qu8_f32_vcvt_ukernel__avx2_x24,
+                    xnn_init_qu8_f32_cvt_avx_params,
+                    benchmark::utils::CheckAVX2)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_f32_vcvt, avx2_x32,
+                    xnn_qu8_f32_vcvt_ukernel__avx2_x32,
+                    xnn_init_qu8_f32_cvt_avx_params,
+                    benchmark::utils::CheckAVX2)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
+    ->UseRealTime();
+
   BENCHMARK_CAPTURE(qu8_f32_vcvt, avx_x8,
                     xnn_qu8_f32_vcvt_ukernel__avx_x8,
                     xnn_init_qu8_f32_cvt_avx_params,
diff --git a/scripts/generate-qs8-f32-vcvt.sh b/scripts/generate-qs8-f32-vcvt.sh
index c355300..c7be7de 100755
--- a/scripts/generate-qs8-f32-vcvt.sh
+++ b/scripts/generate-qs8-f32-vcvt.sh
@@ -47,6 +47,16 @@
 tools/xngen src/qs8-f32-vcvt/avx.c.in -D BATCH_TILE=24 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/vcvt-avx-x24.c &
 tools/xngen src/qs8-f32-vcvt/avx.c.in -D BATCH_TILE=32 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/vcvt-avx-x32.c &
 
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=8  -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/vcvt-avx2-x8.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=24 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/vcvt-avx2-x24.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=32 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/vcvt-avx2-x32.c &
+
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=8  -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/vcvt-avx2-x8.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=16 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=24 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/vcvt-avx2-x24.c &
+tools/xngen src/qs8-f32-vcvt/avx2.c.in -D BATCH_TILE=32 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/vcvt-avx2-x32.c &
+
 ################################## WAsm SIMD ##################################
 tools/xngen src/qs8-f32-vcvt/wasmsimd.c.in -D BATCH_TILE=8  -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/vcvt-wasmsimd-x8.c &
 tools/xngen src/qs8-f32-vcvt/wasmsimd.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/vcvt-wasmsimd-x16.c &
diff --git a/src/init.c b/src/init.c
index ea8a690..bbdbd21 100644
--- a/src/init.c
+++ b/src/init.c
@@ -3573,7 +3573,18 @@
         .element_tile = 32,
       };
     }
-    if (cpuinfo_has_x86_avx()) {
+    if (cpuinfo_has_x86_avx2()) {
+      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
+        .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx2_x16,
+        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
+        .element_tile = 16,
+      };
+      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
+        .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx2_x16,
+        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
+        .element_tile = 16,
+      };
+    } else if (cpuinfo_has_x86_avx()) {
       xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
         .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx_x32,
         .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
diff --git a/src/params-init.c b/src/params-init.c
index 16c02a0..bcf0cb0 100644
--- a/src/params-init.c
+++ b/src/params-init.c
@@ -3262,10 +3262,8 @@
   float scale,
   int8_t zero_point)
 {
-  for (uint32_t i = 0; i < 4; i++) {
-    params->avx.minus_zero_point[i] = -(int32_t) zero_point;
-  }
   for (uint32_t i = 0; i < 8; i++) {
+    params->avx.minus_zero_point[i] = -(int32_t) zero_point;
     params->avx.scale[i] = scale;
   }
 }
@@ -3339,10 +3337,8 @@
   float scale,
   uint8_t zero_point)
 {
-  for (uint32_t i = 0; i < 4; i++) {
-    params->avx.minus_zero_point[i] = -(int32_t) zero_point;
-  }
   for (uint32_t i = 0; i < 8; i++) {
+    params->avx.minus_zero_point[i] = -(int32_t) zero_point;
     params->avx.scale[i] = scale;
   }
 }
diff --git a/src/qs8-f32-vcvt/avx2.c.in b/src/qs8-f32-vcvt/avx2.c.in
new file mode 100644
index 0000000..cf94031
--- /dev/null
+++ b/src/qs8-f32-vcvt/avx2.c.in
@@ -0,0 +1,90 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 8 == 0
+$assert BATCH_TILE >= 8
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
+$_MM256_CVTEPX8_EPI32 = {"QS8": "_mm256_cvtepi8_epi32", "QU8": "_mm256_cvtepu8_epi32"}[DATATYPE]
+$_MM_CVTEPX8_EPI32 = {"QS8": "_mm_cvtepi8_epi32", "QU8": "_mm_cvtepu8_epi32"}[DATATYPE]
+void xnn_${DATATYPE.lower()}_f32_vcvt_ukernel__avx2_x${BATCH_TILE}(
+    size_t n,
+    const ${XINT8_T}* x,
+    float* y,
+    const union xnn_${DATATYPE.lower()}_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(${XINT8_T}) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  for (; n >= ${BATCH_TILE} * sizeof(${XINT8_T}); n -= ${BATCH_TILE} * sizeof(${XINT8_T})) {
+    __m256i vx${ABC[0:8]} = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) x));
+    $for N in range(8, BATCH_TILE, 8):
+      __m256i vx${ABC[N:N+8]} = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) (x + ${N})));
+    x += ${BATCH_TILE};
+
+    $for N in range(0, BATCH_TILE, 8):
+      vx${ABC[N:N+8]} = _mm256_add_epi32(vx${ABC[N:N+8]}, vminus_zero_point);
+
+    $for N in range(0, BATCH_TILE, 8):
+      __m256 vy${ABC[N:N+8]} = _mm256_cvtepi32_ps(vx${ABC[N:N+8]});
+
+    $for N in range(0, BATCH_TILE, 8):
+      vy${ABC[N:N+8]} = _mm256_mul_ps(vy${ABC[N:N+8]}, vscale);
+
+    _mm256_storeu_ps(y, vy${ABC[0:8]});
+    $for N in range(8, BATCH_TILE, 8):
+      _mm256_storeu_ps(y + ${N}, vy${ABC[N:N+8]});
+    y += ${BATCH_TILE};
+  }
+  for (; n >= 8 * sizeof(${XINT8_T}); n -= 8 * sizeof(${XINT8_T})) {
+    __m256i vx = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+    x += 8;
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    _mm256_storeu_ps(y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(${XINT8_T}));
+    assert(n <= 7 * sizeof(${XINT8_T}));
+
+    __m256i vx = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    __m128 vy_lo = _mm256_castps256_ps128(vy);
+    if (n & (4 * sizeof(${XINT8_T}))) {
+      _mm_storeu_ps(y, vy_lo);
+      vy_lo = _mm256_extractf128_ps(vy, 1);
+      y += 4;
+    }
+    if (n & (2 * sizeof(${XINT8_T}))) {
+      _mm_storel_pi((__m64*) y, vy_lo);
+      vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+      y += 2;
+    }
+    if (n & (1 * sizeof(${XINT8_T}))) {
+      _mm_store_ss(y, vy_lo);
+    }
+  }
+}
diff --git a/src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c b/src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c
new file mode 100644
index 0000000..09e4f57
--- /dev/null
+++ b/src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c
@@ -0,0 +1,86 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-f32-vcvt/avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_f32_vcvt_ukernel__avx2_x16(
+    size_t n,
+    const int8_t* x,
+    float* y,
+    const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(int8_t) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
+    __m256i vx01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    __m256i vx89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
+    x += 16;
+
+    vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+    vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
+
+    __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+    __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
+
+    vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+    vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
+
+    _mm256_storeu_ps(y, vy01234567);
+    _mm256_storeu_ps(y + 8, vy89ABCDEF);
+    y += 16;
+  }
+  for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+    __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+    x += 8;
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    _mm256_storeu_ps(y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(int8_t));
+    assert(n <= 7 * sizeof(int8_t));
+
+    __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    __m128 vy_lo = _mm256_castps256_ps128(vy);
+    if (n & (4 * sizeof(int8_t))) {
+      _mm_storeu_ps(y, vy_lo);
+      vy_lo = _mm256_extractf128_ps(vy, 1);
+      y += 4;
+    }
+    if (n & (2 * sizeof(int8_t))) {
+      _mm_storel_pi((__m64*) y, vy_lo);
+      vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+      y += 2;
+    }
+    if (n & (1 * sizeof(int8_t))) {
+      _mm_store_ss(y, vy_lo);
+    }
+  }
+}
diff --git a/src/qs8-f32-vcvt/gen/vcvt-avx2-x24.c b/src/qs8-f32-vcvt/gen/vcvt-avx2-x24.c
new file mode 100644
index 0000000..a433dd9
--- /dev/null
+++ b/src/qs8-f32-vcvt/gen/vcvt-avx2-x24.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-f32-vcvt/avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_f32_vcvt_ukernel__avx2_x24(
+    size_t n,
+    const int8_t* x,
+    float* y,
+    const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(int8_t) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  for (; n >= 24 * sizeof(int8_t); n -= 24 * sizeof(int8_t)) {
+    __m256i vx01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    __m256i vx89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
+    __m256i vxGHIJKLMN = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 16)));
+    x += 24;
+
+    vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+    vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
+    vxGHIJKLMN = _mm256_add_epi32(vxGHIJKLMN, vminus_zero_point);
+
+    __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+    __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
+    __m256 vyGHIJKLMN = _mm256_cvtepi32_ps(vxGHIJKLMN);
+
+    vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+    vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
+    vyGHIJKLMN = _mm256_mul_ps(vyGHIJKLMN, vscale);
+
+    _mm256_storeu_ps(y, vy01234567);
+    _mm256_storeu_ps(y + 8, vy89ABCDEF);
+    _mm256_storeu_ps(y + 16, vyGHIJKLMN);
+    y += 24;
+  }
+  for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+    __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+    x += 8;
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    _mm256_storeu_ps(y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(int8_t));
+    assert(n <= 7 * sizeof(int8_t));
+
+    __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    __m128 vy_lo = _mm256_castps256_ps128(vy);
+    if (n & (4 * sizeof(int8_t))) {
+      _mm_storeu_ps(y, vy_lo);
+      vy_lo = _mm256_extractf128_ps(vy, 1);
+      y += 4;
+    }
+    if (n & (2 * sizeof(int8_t))) {
+      _mm_storel_pi((__m64*) y, vy_lo);
+      vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+      y += 2;
+    }
+    if (n & (1 * sizeof(int8_t))) {
+      _mm_store_ss(y, vy_lo);
+    }
+  }
+}
diff --git a/src/qs8-f32-vcvt/gen/vcvt-avx2-x32.c b/src/qs8-f32-vcvt/gen/vcvt-avx2-x32.c
new file mode 100644
index 0000000..19d3940
--- /dev/null
+++ b/src/qs8-f32-vcvt/gen/vcvt-avx2-x32.c
@@ -0,0 +1,96 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-f32-vcvt/avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_f32_vcvt_ukernel__avx2_x32(
+    size_t n,
+    const int8_t* x,
+    float* y,
+    const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(int8_t) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
+    __m256i vx01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    __m256i vx89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
+    __m256i vxGHIJKLMN = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 16)));
+    __m256i vxOPQRSTUV = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 24)));
+    x += 32;
+
+    vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+    vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
+    vxGHIJKLMN = _mm256_add_epi32(vxGHIJKLMN, vminus_zero_point);
+    vxOPQRSTUV = _mm256_add_epi32(vxOPQRSTUV, vminus_zero_point);
+
+    __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+    __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
+    __m256 vyGHIJKLMN = _mm256_cvtepi32_ps(vxGHIJKLMN);
+    __m256 vyOPQRSTUV = _mm256_cvtepi32_ps(vxOPQRSTUV);
+
+    vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+    vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
+    vyGHIJKLMN = _mm256_mul_ps(vyGHIJKLMN, vscale);
+    vyOPQRSTUV = _mm256_mul_ps(vyOPQRSTUV, vscale);
+
+    _mm256_storeu_ps(y, vy01234567);
+    _mm256_storeu_ps(y + 8, vy89ABCDEF);
+    _mm256_storeu_ps(y + 16, vyGHIJKLMN);
+    _mm256_storeu_ps(y + 24, vyOPQRSTUV);
+    y += 32;
+  }
+  for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+    __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+    x += 8;
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    _mm256_storeu_ps(y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(int8_t));
+    assert(n <= 7 * sizeof(int8_t));
+
+    __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    __m128 vy_lo = _mm256_castps256_ps128(vy);
+    if (n & (4 * sizeof(int8_t))) {
+      _mm_storeu_ps(y, vy_lo);
+      vy_lo = _mm256_extractf128_ps(vy, 1);
+      y += 4;
+    }
+    if (n & (2 * sizeof(int8_t))) {
+      _mm_storel_pi((__m64*) y, vy_lo);
+      vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+      y += 2;
+    }
+    if (n & (1 * sizeof(int8_t))) {
+      _mm_store_ss(y, vy_lo);
+    }
+  }
+}
diff --git a/src/qs8-f32-vcvt/gen/vcvt-avx2-x8.c b/src/qs8-f32-vcvt/gen/vcvt-avx2-x8.c
new file mode 100644
index 0000000..aafea27
--- /dev/null
+++ b/src/qs8-f32-vcvt/gen/vcvt-avx2-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-f32-vcvt/avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_f32_vcvt_ukernel__avx2_x8(
+    size_t n,
+    const int8_t* x,
+    float* y,
+    const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(int8_t) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+    __m256i vx01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    x += 8;
+
+    vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+
+    __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+
+    vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+
+    _mm256_storeu_ps(y, vy01234567);
+    y += 8;
+  }
+  for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+    __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+    x += 8;
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    _mm256_storeu_ps(y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(int8_t));
+    assert(n <= 7 * sizeof(int8_t));
+
+    __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    __m128 vy_lo = _mm256_castps256_ps128(vy);
+    if (n & (4 * sizeof(int8_t))) {
+      _mm_storeu_ps(y, vy_lo);
+      vy_lo = _mm256_extractf128_ps(vy, 1);
+      y += 4;
+    }
+    if (n & (2 * sizeof(int8_t))) {
+      _mm_storel_pi((__m64*) y, vy_lo);
+      vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+      y += 2;
+    }
+    if (n & (1 * sizeof(int8_t))) {
+      _mm_store_ss(y, vy_lo);
+    }
+  }
+}
diff --git a/src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c b/src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c
new file mode 100644
index 0000000..62deb0f
--- /dev/null
+++ b/src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c
@@ -0,0 +1,86 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-f32-vcvt/avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_f32_vcvt_ukernel__avx2_x16(
+    size_t n,
+    const uint8_t* x,
+    float* y,
+    const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(uint8_t) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+    __m256i vx01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    __m256i vx89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
+    x += 16;
+
+    vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+    vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
+
+    __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+    __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
+
+    vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+    vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
+
+    _mm256_storeu_ps(y, vy01234567);
+    _mm256_storeu_ps(y + 8, vy89ABCDEF);
+    y += 16;
+  }
+  for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+    __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+    x += 8;
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    _mm256_storeu_ps(y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(uint8_t));
+    assert(n <= 7 * sizeof(uint8_t));
+
+    __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    __m128 vy_lo = _mm256_castps256_ps128(vy);
+    if (n & (4 * sizeof(uint8_t))) {
+      _mm_storeu_ps(y, vy_lo);
+      vy_lo = _mm256_extractf128_ps(vy, 1);
+      y += 4;
+    }
+    if (n & (2 * sizeof(uint8_t))) {
+      _mm_storel_pi((__m64*) y, vy_lo);
+      vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+      y += 2;
+    }
+    if (n & (1 * sizeof(uint8_t))) {
+      _mm_store_ss(y, vy_lo);
+    }
+  }
+}
diff --git a/src/qu8-f32-vcvt/gen/vcvt-avx2-x24.c b/src/qu8-f32-vcvt/gen/vcvt-avx2-x24.c
new file mode 100644
index 0000000..5ef479f
--- /dev/null
+++ b/src/qu8-f32-vcvt/gen/vcvt-avx2-x24.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-f32-vcvt/avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_f32_vcvt_ukernel__avx2_x24(
+    size_t n,
+    const uint8_t* x,
+    float* y,
+    const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(uint8_t) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  for (; n >= 24 * sizeof(uint8_t); n -= 24 * sizeof(uint8_t)) {
+    __m256i vx01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    __m256i vx89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
+    __m256i vxGHIJKLMN = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 16)));
+    x += 24;
+
+    vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+    vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
+    vxGHIJKLMN = _mm256_add_epi32(vxGHIJKLMN, vminus_zero_point);
+
+    __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+    __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
+    __m256 vyGHIJKLMN = _mm256_cvtepi32_ps(vxGHIJKLMN);
+
+    vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+    vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
+    vyGHIJKLMN = _mm256_mul_ps(vyGHIJKLMN, vscale);
+
+    _mm256_storeu_ps(y, vy01234567);
+    _mm256_storeu_ps(y + 8, vy89ABCDEF);
+    _mm256_storeu_ps(y + 16, vyGHIJKLMN);
+    y += 24;
+  }
+  for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+    __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+    x += 8;
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    _mm256_storeu_ps(y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(uint8_t));
+    assert(n <= 7 * sizeof(uint8_t));
+
+    __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    __m128 vy_lo = _mm256_castps256_ps128(vy);
+    if (n & (4 * sizeof(uint8_t))) {
+      _mm_storeu_ps(y, vy_lo);
+      vy_lo = _mm256_extractf128_ps(vy, 1);
+      y += 4;
+    }
+    if (n & (2 * sizeof(uint8_t))) {
+      _mm_storel_pi((__m64*) y, vy_lo);
+      vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+      y += 2;
+    }
+    if (n & (1 * sizeof(uint8_t))) {
+      _mm_store_ss(y, vy_lo);
+    }
+  }
+}
diff --git a/src/qu8-f32-vcvt/gen/vcvt-avx2-x32.c b/src/qu8-f32-vcvt/gen/vcvt-avx2-x32.c
new file mode 100644
index 0000000..90540d6
--- /dev/null
+++ b/src/qu8-f32-vcvt/gen/vcvt-avx2-x32.c
@@ -0,0 +1,96 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-f32-vcvt/avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_f32_vcvt_ukernel__avx2_x32(
+    size_t n,
+    const uint8_t* x,
+    float* y,
+    const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(uint8_t) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
+    __m256i vx01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    __m256i vx89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
+    __m256i vxGHIJKLMN = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 16)));
+    __m256i vxOPQRSTUV = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 24)));
+    x += 32;
+
+    vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+    vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
+    vxGHIJKLMN = _mm256_add_epi32(vxGHIJKLMN, vminus_zero_point);
+    vxOPQRSTUV = _mm256_add_epi32(vxOPQRSTUV, vminus_zero_point);
+
+    __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+    __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
+    __m256 vyGHIJKLMN = _mm256_cvtepi32_ps(vxGHIJKLMN);
+    __m256 vyOPQRSTUV = _mm256_cvtepi32_ps(vxOPQRSTUV);
+
+    vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+    vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
+    vyGHIJKLMN = _mm256_mul_ps(vyGHIJKLMN, vscale);
+    vyOPQRSTUV = _mm256_mul_ps(vyOPQRSTUV, vscale);
+
+    _mm256_storeu_ps(y, vy01234567);
+    _mm256_storeu_ps(y + 8, vy89ABCDEF);
+    _mm256_storeu_ps(y + 16, vyGHIJKLMN);
+    _mm256_storeu_ps(y + 24, vyOPQRSTUV);
+    y += 32;
+  }
+  for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+    __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+    x += 8;
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    _mm256_storeu_ps(y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(uint8_t));
+    assert(n <= 7 * sizeof(uint8_t));
+
+    __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    __m128 vy_lo = _mm256_castps256_ps128(vy);
+    if (n & (4 * sizeof(uint8_t))) {
+      _mm_storeu_ps(y, vy_lo);
+      vy_lo = _mm256_extractf128_ps(vy, 1);
+      y += 4;
+    }
+    if (n & (2 * sizeof(uint8_t))) {
+      _mm_storel_pi((__m64*) y, vy_lo);
+      vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+      y += 2;
+    }
+    if (n & (1 * sizeof(uint8_t))) {
+      _mm_store_ss(y, vy_lo);
+    }
+  }
+}
diff --git a/src/qu8-f32-vcvt/gen/vcvt-avx2-x8.c b/src/qu8-f32-vcvt/gen/vcvt-avx2-x8.c
new file mode 100644
index 0000000..48a0792
--- /dev/null
+++ b/src/qu8-f32-vcvt/gen/vcvt-avx2-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-f32-vcvt/avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_f32_vcvt_ukernel__avx2_x8(
+    size_t n,
+    const uint8_t* x,
+    float* y,
+    const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(uint8_t) == 0);
+  assert(x != NULL);
+  assert(y != NULL);
+
+  const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+    __m256i vx01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    x += 8;
+
+    vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
+
+    __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
+
+    vy01234567 = _mm256_mul_ps(vy01234567, vscale);
+
+    _mm256_storeu_ps(y, vy01234567);
+    y += 8;
+  }
+  for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+    __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+    x += 8;
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    _mm256_storeu_ps(y, vy);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(uint8_t));
+    assert(n <= 7 * sizeof(uint8_t));
+
+    __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
+    vx = _mm256_add_epi32(vx, vminus_zero_point);
+
+    __m256 vy = _mm256_cvtepi32_ps(vx);
+    vy = _mm256_mul_ps(vy, vscale);
+
+    __m128 vy_lo = _mm256_castps256_ps128(vy);
+    if (n & (4 * sizeof(uint8_t))) {
+      _mm_storeu_ps(y, vy_lo);
+      vy_lo = _mm256_extractf128_ps(vy, 1);
+      y += 4;
+    }
+    if (n & (2 * sizeof(uint8_t))) {
+      _mm_storel_pi((__m64*) y, vy_lo);
+      vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
+      y += 2;
+    }
+    if (n & (1 * sizeof(uint8_t))) {
+      _mm_store_ss(y, vy_lo);
+    }
+  }
+}
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 5dba744..26da611 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -1038,7 +1038,7 @@
     XNN_ALIGN(16) float scale[4];
   } sse4;
   struct {
-    XNN_ALIGN(16) int32_t minus_zero_point[4];
+    XNN_ALIGN(32) int32_t minus_zero_point[8];
     XNN_ALIGN(32) float scale[8];
   } avx;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -1072,7 +1072,7 @@
     XNN_ALIGN(16) float scale[4];
   } sse4;
   struct {
-    XNN_ALIGN(16) int32_t minus_zero_point[4];
+    XNN_ALIGN(32) int32_t minus_zero_point[8];
     XNN_ALIGN(32) float scale[8];
   } avx;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/src/xnnpack/vcvt.h b/src/xnnpack/vcvt.h
index c9ffd83..cfc1614 100644
--- a/src/xnnpack/vcvt.h
+++ b/src/xnnpack/vcvt.h
@@ -298,6 +298,11 @@
 DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__avx_x24)
 DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__avx_x32)
 
+DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__avx2_x8)
+DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__avx2_x16)
+DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__avx2_x24)
+DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__avx2_x32)
+
 DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__wasmsimd_x8)
 DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__wasmsimd_x16)
 DECLARE_QS8_F32_VCVT_UKERNEL_FUNCTION(xnn_qs8_f32_vcvt_ukernel__wasmsimd_x24)
@@ -336,6 +341,11 @@
 DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__avx_x24)
 DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__avx_x32)
 
+DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__avx2_x8)
+DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__avx2_x16)
+DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__avx2_x24)
+DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__avx2_x32)
+
 DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__wasmsimd_x8)
 DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__wasmsimd_x16)
 DECLARE_QU8_F32_VCVT_UKERNEL_FUNCTION(xnn_qu8_f32_vcvt_ukernel__wasmsimd_x24)
diff --git a/test/qs8-f32-vcvt.cc b/test/qs8-f32-vcvt.cc
index a6e54cc..6cb814f 100644
--- a/test/qs8-f32-vcvt.cc
+++ b/test/qs8-f32-vcvt.cc
@@ -961,6 +961,242 @@
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_F32_VCVT__AVX2_X8, batch_eq_8) {
+    TEST_REQUIRES_X86_AVX2;
+    VCvtMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x8, xnn_init_qs8_f32_cvt_avx_params);
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X8, batch_div_8) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x8, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X8, batch_lt_8) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x8, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X8, batch_gt_8) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x8, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X8, scale) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x8, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X8, zero_point) {
+    TEST_REQUIRES_X86_AVX2;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x8, xnn_init_qs8_f32_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_F32_VCVT__AVX2_X16, batch_eq_16) {
+    TEST_REQUIRES_X86_AVX2;
+    VCvtMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x16, xnn_init_qs8_f32_cvt_avx_params);
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X16, batch_div_16) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x16, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X16, batch_lt_16) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x16, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X16, batch_gt_16) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x16, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X16, scale) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x16, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X16, zero_point) {
+    TEST_REQUIRES_X86_AVX2;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x16, xnn_init_qs8_f32_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_F32_VCVT__AVX2_X24, batch_eq_24) {
+    TEST_REQUIRES_X86_AVX2;
+    VCvtMicrokernelTester()
+      .batch_size(24)
+      .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x24, xnn_init_qs8_f32_cvt_avx_params);
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X24, batch_div_24) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 48; batch_size < 240; batch_size += 24) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x24, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X24, batch_lt_24) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size < 24; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x24, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X24, batch_gt_24) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 25; batch_size < 48; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x24, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X24, scale) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x24, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X24, zero_point) {
+    TEST_REQUIRES_X86_AVX2;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x24, xnn_init_qs8_f32_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_F32_VCVT__AVX2_X32, batch_eq_32) {
+    TEST_REQUIRES_X86_AVX2;
+    VCvtMicrokernelTester()
+      .batch_size(32)
+      .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x32, xnn_init_qs8_f32_cvt_avx_params);
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X32, batch_div_32) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x32, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X32, batch_lt_32) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x32, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X32, batch_gt_32) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x32, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X32, scale) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x32, xnn_init_qs8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QS8_F32_VCVT__AVX2_X32, zero_point) {
+    TEST_REQUIRES_X86_AVX2;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .Test(xnn_qs8_f32_vcvt_ukernel__avx2_x32, xnn_init_qs8_f32_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
 #if XNN_ARCH_WASMSIMD
   TEST(QS8_F32_VCVT__WASMSIMD_X8, batch_eq_8) {
     VCvtMicrokernelTester()
diff --git a/test/qs8-f32-vcvt.yaml b/test/qs8-f32-vcvt.yaml
index 0d44707..e40a9af 100644
--- a/test/qs8-f32-vcvt.yaml
+++ b/test/qs8-f32-vcvt.yaml
@@ -35,6 +35,14 @@
   init: xnn_init_qs8_f32_cvt_avx_params
 - name: xnn_qs8_f32_vcvt_ukernel__avx_x32
   init: xnn_init_qs8_f32_cvt_avx_params
+- name: xnn_qs8_f32_vcvt_ukernel__avx2_x8
+  init: xnn_init_qs8_f32_cvt_avx_params
+- name: xnn_qs8_f32_vcvt_ukernel__avx2_x16
+  init: xnn_init_qs8_f32_cvt_avx_params
+- name: xnn_qs8_f32_vcvt_ukernel__avx2_x24
+  init: xnn_init_qs8_f32_cvt_avx_params
+- name: xnn_qs8_f32_vcvt_ukernel__avx2_x32
+  init: xnn_init_qs8_f32_cvt_avx_params
 - name: xnn_qs8_f32_vcvt_ukernel__wasmsimd_x8
   init: xnn_init_qs8_f32_cvt_wasmsimd_params
 - name: xnn_qs8_f32_vcvt_ukernel__wasmsimd_x16
diff --git a/test/qu8-f32-vcvt.cc b/test/qu8-f32-vcvt.cc
index 95eac8f..058f687 100644
--- a/test/qu8-f32-vcvt.cc
+++ b/test/qu8-f32-vcvt.cc
@@ -961,6 +961,242 @@
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QU8_F32_VCVT__AVX2_X8, batch_eq_8) {
+    TEST_REQUIRES_X86_AVX2;
+    VCvtMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x8, xnn_init_qu8_f32_cvt_avx_params);
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X8, batch_div_8) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x8, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X8, batch_lt_8) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x8, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X8, batch_gt_8) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x8, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X8, scale) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x8, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X8, zero_point) {
+    TEST_REQUIRES_X86_AVX2;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x8, xnn_init_qu8_f32_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QU8_F32_VCVT__AVX2_X16, batch_eq_16) {
+    TEST_REQUIRES_X86_AVX2;
+    VCvtMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x16, xnn_init_qu8_f32_cvt_avx_params);
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X16, batch_div_16) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x16, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X16, batch_lt_16) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x16, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X16, batch_gt_16) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x16, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X16, scale) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x16, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X16, zero_point) {
+    TEST_REQUIRES_X86_AVX2;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x16, xnn_init_qu8_f32_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QU8_F32_VCVT__AVX2_X24, batch_eq_24) {
+    TEST_REQUIRES_X86_AVX2;
+    VCvtMicrokernelTester()
+      .batch_size(24)
+      .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x24, xnn_init_qu8_f32_cvt_avx_params);
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X24, batch_div_24) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 48; batch_size < 240; batch_size += 24) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x24, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X24, batch_lt_24) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size < 24; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x24, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X24, batch_gt_24) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 25; batch_size < 48; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x24, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X24, scale) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x24, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X24, zero_point) {
+    TEST_REQUIRES_X86_AVX2;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x24, xnn_init_qu8_f32_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QU8_F32_VCVT__AVX2_X32, batch_eq_32) {
+    TEST_REQUIRES_X86_AVX2;
+    VCvtMicrokernelTester()
+      .batch_size(32)
+      .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x32, xnn_init_qu8_f32_cvt_avx_params);
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X32, batch_div_32) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x32, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X32, batch_lt_32) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x32, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X32, batch_gt_32) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x32, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X32, scale) {
+    TEST_REQUIRES_X86_AVX2;
+    for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+      VCvtMicrokernelTester()
+        .batch_size(batch_size)
+        .scale(50)
+        .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x32, xnn_init_qu8_f32_cvt_avx_params);
+    }
+  }
+
+  TEST(QU8_F32_VCVT__AVX2_X32, zero_point) {
+    TEST_REQUIRES_X86_AVX2;
+    for (int16_t zero_point = 0; zero_point < 5; zero_point += 2) {
+      for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+        VCvtMicrokernelTester()
+          .batch_size(batch_size)
+          .zero_point(zero_point)
+          .Test(xnn_qu8_f32_vcvt_ukernel__avx2_x32, xnn_init_qu8_f32_cvt_avx_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
 #if XNN_ARCH_WASMSIMD
   TEST(QU8_F32_VCVT__WASMSIMD_X8, batch_eq_8) {
     VCvtMicrokernelTester()
diff --git a/test/qu8-f32-vcvt.yaml b/test/qu8-f32-vcvt.yaml
index 716a498..02d69f2 100644
--- a/test/qu8-f32-vcvt.yaml
+++ b/test/qu8-f32-vcvt.yaml
@@ -35,6 +35,14 @@
   init: xnn_init_qu8_f32_cvt_avx_params
 - name: xnn_qu8_f32_vcvt_ukernel__avx_x32
   init: xnn_init_qu8_f32_cvt_avx_params
+- name: xnn_qu8_f32_vcvt_ukernel__avx2_x8
+  init: xnn_init_qu8_f32_cvt_avx_params
+- name: xnn_qu8_f32_vcvt_ukernel__avx2_x16
+  init: xnn_init_qu8_f32_cvt_avx_params
+- name: xnn_qu8_f32_vcvt_ukernel__avx2_x24
+  init: xnn_init_qu8_f32_cvt_avx_params
+- name: xnn_qu8_f32_vcvt_ukernel__avx2_x32
+  init: xnn_init_qu8_f32_cvt_avx_params
 - name: xnn_qu8_f32_vcvt_ukernel__wasmsimd_x8
   init: xnn_init_qu8_f32_cvt_wasmsimd_params
 - name: xnn_qu8_f32_vcvt_ukernel__wasmsimd_x16