SSSE3, AVX, and AVX2 X8 LUT microkernels
PiperOrigin-RevId: 395948284
diff --git a/BUILD.bazel b/BUILD.bazel
index 3cfe853..b54c9ac 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -3970,6 +3970,8 @@
"src/qu8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c",
"src/qu8-requantization/gemmlowp-ssse3.c",
"src/qu8-requantization/rndna-ssse3.c",
+ "src/x8-lut/gen/lut-ssse3-x16.c",
+ "src/x8-lut/gen/lut-ssse3-x32.c",
]
PROD_SSE41_MICROKERNEL_SRCS = [
@@ -4640,6 +4642,10 @@
"src/qu8-vmul/gen/minmax-fp32-avx-mul16-ld64-x16.c",
"src/qu8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x8.c",
"src/qu8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x16.c",
+ "src/x8-lut/gen/lut-avx-x16.c",
+ "src/x8-lut/gen/lut-avx-x32.c",
+ "src/x8-lut/gen/lut-avx-x48.c",
+ "src/x8-lut/gen/lut-avx-x64.c",
]
PROD_XOP_MICROKERNEL_SRCS = [
@@ -5174,6 +5180,10 @@
"src/qu8-vadd/gen/minmax-avx2-mul32-ld64-x16.c",
"src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x8.c",
"src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c",
+ "src/x8-lut/gen/lut-avx2-x32.c",
+ "src/x8-lut/gen/lut-avx2-x64.c",
+ "src/x8-lut/gen/lut-avx2-x96.c",
+ "src/x8-lut/gen/lut-avx2-x128.c",
]
PROD_AVX512F_MICROKERNEL_SRCS = [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83a6344..a7a9303 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3019,7 +3019,9 @@
src/qu8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
src/qu8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
src/qu8-requantization/gemmlowp-ssse3.c
- src/qu8-requantization/rndna-ssse3.c)
+ src/qu8-requantization/rndna-ssse3.c
+ src/x8-lut/gen/lut-ssse3-x16.c
+ src/x8-lut/gen/lut-ssse3-x32.c)
SET(PROD_SSE41_MICROKERNEL_SRCS
src/f32-prelu/gen/sse41-2x8.c
@@ -3683,7 +3685,11 @@
src/qu8-vmul/gen/minmax-fp32-avx-mul16-ld64-x8.c
src/qu8-vmul/gen/minmax-fp32-avx-mul16-ld64-x16.c
src/qu8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x8.c
- src/qu8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x16.c)
+ src/qu8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x16.c
+ src/x8-lut/gen/lut-avx-x16.c
+ src/x8-lut/gen/lut-avx-x32.c
+ src/x8-lut/gen/lut-avx-x48.c
+ src/x8-lut/gen/lut-avx-x64.c)
SET(PROD_XOP_MICROKERNEL_SRCS
src/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul16-add16.c
@@ -4213,7 +4219,11 @@
src/qu8-vadd/gen/minmax-avx2-mul32-ld64-x8.c
src/qu8-vadd/gen/minmax-avx2-mul32-ld64-x16.c
src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x8.c
- src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c)
+ src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c
+ src/x8-lut/gen/lut-avx2-x32.c
+ src/x8-lut/gen/lut-avx2-x64.c
+ src/x8-lut/gen/lut-avx2-x96.c
+ src/x8-lut/gen/lut-avx2-x128.c)
SET(PROD_AVX512F_MICROKERNEL_SRCS
src/f32-dwconv/gen/up16x4-minmax-avx512f.c
diff --git a/bench/x8-lut.cc b/bench/x8-lut.cc
index 6451cb6..5da2626 100644
--- a/bench/x8-lut.cc
+++ b/bench/x8-lut.cc
@@ -78,6 +78,61 @@
->UseRealTime();
#endif // XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(x8_lut, avx2_x32,
+ xnn_x8_lut_ukernel__avx2_x32,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(x8_lut, avx2_x64,
+ xnn_x8_lut_ukernel__avx2_x64,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(x8_lut, avx2_x96,
+ xnn_x8_lut_ukernel__avx2_x96,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(x8_lut, avx2_x128,
+ xnn_x8_lut_ukernel__avx2_x128,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+
+ BENCHMARK_CAPTURE(x8_lut, avx_x16,
+ xnn_x8_lut_ukernel__avx_x16,
+ benchmark::utils::CheckAVX)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(x8_lut, avx_x32,
+ xnn_x8_lut_ukernel__avx_x32,
+ benchmark::utils::CheckAVX)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(x8_lut, avx_x48,
+ xnn_x8_lut_ukernel__avx_x48,
+ benchmark::utils::CheckAVX)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(x8_lut, avx_x64,
+ xnn_x8_lut_ukernel__avx_x64,
+ benchmark::utils::CheckAVX)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+
+ BENCHMARK_CAPTURE(x8_lut, ssse3_x16,
+ xnn_x8_lut_ukernel__ssse3_x16,
+ benchmark::utils::CheckSSSE3)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(x8_lut, ssse3_x32,
+ xnn_x8_lut_ukernel__ssse3_x32,
+ benchmark::utils::CheckSSSE3)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
BENCHMARK_CAPTURE(x8_lut, scalar_x1,
xnn_x8_lut_ukernel__scalar_x1)
->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
diff --git a/scripts/generate-x8-lut.sh b/scripts/generate-x8-lut.sh
index 9547369..1dccf62 100755
--- a/scripts/generate-x8-lut.sh
+++ b/scripts/generate-x8-lut.sh
@@ -17,6 +17,21 @@
tools/xngen src/x8-lut/neon-tbx128x4.c.in -D BATCH_TILE=48 -o src/x8-lut/gen/lut-neon-tbx128x4-x48.c &
tools/xngen src/x8-lut/neon-tbx128x4.c.in -D BATCH_TILE=64 -o src/x8-lut/gen/lut-neon-tbx128x4-x64.c &
+################################### x86 SSE ###################################
+tools/xngen src/x8-lut/ssse3.c.in -D AVX=0 -D BATCH_TILE=16 -o src/x8-lut/gen/lut-ssse3-x16.c &
+tools/xngen src/x8-lut/ssse3.c.in -D AVX=0 -D BATCH_TILE=32 -o src/x8-lut/gen/lut-ssse3-x32.c &
+
+tools/xngen src/x8-lut/ssse3.c.in -D AVX=1 -D BATCH_TILE=16 -o src/x8-lut/gen/lut-avx-x16.c &
+tools/xngen src/x8-lut/ssse3.c.in -D AVX=1 -D BATCH_TILE=32 -o src/x8-lut/gen/lut-avx-x32.c &
+tools/xngen src/x8-lut/ssse3.c.in -D AVX=1 -D BATCH_TILE=48 -o src/x8-lut/gen/lut-avx-x48.c &
+tools/xngen src/x8-lut/ssse3.c.in -D AVX=1 -D BATCH_TILE=64 -o src/x8-lut/gen/lut-avx-x64.c &
+
+################################### x86 AVX2 ##################################
+tools/xngen src/x8-lut/avx2.c.in -D BATCH_TILE=32 -o src/x8-lut/gen/lut-avx2-x32.c &
+tools/xngen src/x8-lut/avx2.c.in -D BATCH_TILE=64 -o src/x8-lut/gen/lut-avx2-x64.c &
+tools/xngen src/x8-lut/avx2.c.in -D BATCH_TILE=96 -o src/x8-lut/gen/lut-avx2-x96.c &
+tools/xngen src/x8-lut/avx2.c.in -D BATCH_TILE=128 -o src/x8-lut/gen/lut-avx2-x128.c &
+
################################## Unit tests #################################
tools/generate-lut-test.py --spec test/x8-lut.yaml --output test/x8-lut.cc &
diff --git a/src/x8-lut/avx2.c.in b/src/x8-lut/avx2.c.in
new file mode 100644
index 0000000..2efc520
--- /dev/null
+++ b/src/x8-lut/avx2.c.in
@@ -0,0 +1,114 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE >= 32
+$assert BATCH_TILE % 32 == 0
+$SIMD_TILE = BATCH_TILE // 32
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/lut.h>
+#include <xnnpack/common.h>
+
+
+void xnn_x8_lut_ukernel__avx2_x${BATCH_TILE}(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
+{
+ assert(n != 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vt0 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) t));
+ $for T in range(1, 16):
+ const __m256i vt${ABC[T]} = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + ${T * 16})));
+
+ const __m256i vtable0 = vt0;
+ $for T in range(1, 8):
+ const __m256i vtable${ABC[T]} = _mm256_xor_si256(vt${ABC[T-1]}, vt${ABC[T]});
+ $for T in range(8, 16):
+ const __m256i vtable${ABC[T]} = _mm256_xor_si256(_mm256_xor_si256(vt${ABC[T-1]}, vt${ABC[T]}), vtable${ABC[T-8]});
+
+ const __m256i voffset = _mm256_set1_epi8(16);
+ for (; n >= ${BATCH_TILE} * sizeof(uint8_t); n -= ${BATCH_TILE} * sizeof(uint8_t)) {
+ __m256i vx0 = _mm256_loadu_si256((const __m256i*) x);
+ $for N in range(1, SIMD_TILE):
+ __m256i vx${N} = _mm256_loadu_si256((const __m256i*) (x + ${N * 32}));
+ x += ${BATCH_TILE};
+
+ $for N in range(SIMD_TILE):
+ __m256i vy${N} = _mm256_shuffle_epi8(vtable0, vx${N});
+
+ $for T in range(1, 9):
+ $for N in range(SIMD_TILE):
+ vx${N} = _mm256_sub_epi8(vx${N}, voffset);
+ $for N in range(SIMD_TILE):
+ vy${N} = _mm256_xor_si256(vy${N}, _mm256_shuffle_epi8(vtable${ABC[T]}, vx${N}));
+
+ $for T in range(9, 16):
+ $for N in range(SIMD_TILE):
+ vx${N} = _mm256_subs_epi8(vx${N}, voffset);
+ $for N in range(SIMD_TILE):
+ vy${N} = _mm256_xor_si256(vy${N}, _mm256_shuffle_epi8(vtable${ABC[T]}, vx${N}));
+
+ _mm256_storeu_si256((__m256i*) y, vy0);
+ $for N in range(1, SIMD_TILE):
+ _mm256_storeu_si256((__m256i*) (y + ${N * 32}), vy${N});
+ y += ${BATCH_TILE};
+ }
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+ x += 16;
+
+ __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
+
+ $for T in range(1, 9):
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable${ABC[T]}), vx));
+
+ $for T in range(9, 16):
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable${ABC[T]}), vx));
+
+ _mm_storeu_si128((__m128i*) y, vy);
+ y += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+
+ __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
+
+ $for T in range(1, 9):
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable${ABC[T]}), vx));
+
+ $for T in range(9, 16):
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable${ABC[T]}), vx));
+
+ if (n & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64((__m128i*) y, vy);
+ vy = _mm_unpackhi_epi64(vy, vy);
+ y += 8;
+ }
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_si32(y, vy);
+ vy = _mm_srli_epi64(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ vy = _mm_srli_epi32(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/x8-lut/gen/lut-avx-x16.c b/src/x8-lut/gen/lut-avx-x16.c
new file mode 100644
index 0000000..e21634a
--- /dev/null
+++ b/src/x8-lut/gen/lut-avx-x16.c
@@ -0,0 +1,160 @@
+// Auto-generated file. Do not edit!
+// Template: src/x8-lut/ssse3.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/lut.h>
+#include <xnnpack/common.h>
+
+
+void xnn_x8_lut_ukernel__avx_x16(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
+{
+ assert(n != 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m128i vt0 = _mm_load_si128((const __m128i*) t);
+ const __m128i vt1 = _mm_load_si128((const __m128i*) (t + 16));
+ const __m128i vt2 = _mm_load_si128((const __m128i*) (t + 32));
+ const __m128i vt3 = _mm_load_si128((const __m128i*) (t + 48));
+ const __m128i vt4 = _mm_load_si128((const __m128i*) (t + 64));
+ const __m128i vt5 = _mm_load_si128((const __m128i*) (t + 80));
+ const __m128i vt6 = _mm_load_si128((const __m128i*) (t + 96));
+ const __m128i vt7 = _mm_load_si128((const __m128i*) (t + 112));
+ const __m128i vt8 = _mm_load_si128((const __m128i*) (t + 128));
+ const __m128i vt9 = _mm_load_si128((const __m128i*) (t + 144));
+ const __m128i vtA = _mm_load_si128((const __m128i*) (t + 160));
+ const __m128i vtB = _mm_load_si128((const __m128i*) (t + 176));
+ const __m128i vtC = _mm_load_si128((const __m128i*) (t + 192));
+ const __m128i vtD = _mm_load_si128((const __m128i*) (t + 208));
+ const __m128i vtE = _mm_load_si128((const __m128i*) (t + 224));
+ const __m128i vtF = _mm_load_si128((const __m128i*) (t + 240));
+
+ const __m128i vtable0 = vt0;
+ const __m128i vtable1 = _mm_xor_si128(vt0, vt1);
+ const __m128i vtable2 = _mm_xor_si128(vt1, vt2);
+ const __m128i vtable3 = _mm_xor_si128(vt2, vt3);
+ const __m128i vtable4 = _mm_xor_si128(vt3, vt4);
+ const __m128i vtable5 = _mm_xor_si128(vt4, vt5);
+ const __m128i vtable6 = _mm_xor_si128(vt5, vt6);
+ const __m128i vtable7 = _mm_xor_si128(vt6, vt7);
+ const __m128i vtable8 = _mm_xor_si128(_mm_xor_si128(vt7, vt8), vtable0);
+ const __m128i vtable9 = _mm_xor_si128(_mm_xor_si128(vt8, vt9), vtable1);
+ const __m128i vtableA = _mm_xor_si128(_mm_xor_si128(vt9, vtA), vtable2);
+ const __m128i vtableB = _mm_xor_si128(_mm_xor_si128(vtA, vtB), vtable3);
+ const __m128i vtableC = _mm_xor_si128(_mm_xor_si128(vtB, vtC), vtable4);
+ const __m128i vtableD = _mm_xor_si128(_mm_xor_si128(vtC, vtD), vtable5);
+ const __m128i vtableE = _mm_xor_si128(_mm_xor_si128(vtD, vtE), vtable6);
+ const __m128i vtableF = _mm_xor_si128(_mm_xor_si128(vtE, vtF), vtable7);
+
+ const __m128i voffset = _mm_set1_epi8(16);
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+ x += 16;
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
+
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
+
+ _mm_storeu_si128((__m128i*) y, vy);
+ y += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
+
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
+
+ if (n & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64((__m128i*) y, vy);
+ vy = _mm_unpackhi_epi64(vy, vy);
+ y += 8;
+ }
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_si32(y, vy);
+ vy = _mm_srli_epi64(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ vy = _mm_srli_epi32(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/x8-lut/gen/lut-avx-x32.c b/src/x8-lut/gen/lut-avx-x32.c
new file mode 100644
index 0000000..2a1c454
--- /dev/null
+++ b/src/x8-lut/gen/lut-avx-x32.c
@@ -0,0 +1,234 @@
+// Auto-generated file. Do not edit!
+// Template: src/x8-lut/ssse3.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/lut.h>
+#include <xnnpack/common.h>
+
+
+void xnn_x8_lut_ukernel__avx_x32(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
+{
+ assert(n != 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m128i vt0 = _mm_load_si128((const __m128i*) t);
+ const __m128i vt1 = _mm_load_si128((const __m128i*) (t + 16));
+ const __m128i vt2 = _mm_load_si128((const __m128i*) (t + 32));
+ const __m128i vt3 = _mm_load_si128((const __m128i*) (t + 48));
+ const __m128i vt4 = _mm_load_si128((const __m128i*) (t + 64));
+ const __m128i vt5 = _mm_load_si128((const __m128i*) (t + 80));
+ const __m128i vt6 = _mm_load_si128((const __m128i*) (t + 96));
+ const __m128i vt7 = _mm_load_si128((const __m128i*) (t + 112));
+ const __m128i vt8 = _mm_load_si128((const __m128i*) (t + 128));
+ const __m128i vt9 = _mm_load_si128((const __m128i*) (t + 144));
+ const __m128i vtA = _mm_load_si128((const __m128i*) (t + 160));
+ const __m128i vtB = _mm_load_si128((const __m128i*) (t + 176));
+ const __m128i vtC = _mm_load_si128((const __m128i*) (t + 192));
+ const __m128i vtD = _mm_load_si128((const __m128i*) (t + 208));
+ const __m128i vtE = _mm_load_si128((const __m128i*) (t + 224));
+ const __m128i vtF = _mm_load_si128((const __m128i*) (t + 240));
+
+ const __m128i vtable0 = vt0;
+ const __m128i vtable1 = _mm_xor_si128(vt0, vt1);
+ const __m128i vtable2 = _mm_xor_si128(vt1, vt2);
+ const __m128i vtable3 = _mm_xor_si128(vt2, vt3);
+ const __m128i vtable4 = _mm_xor_si128(vt3, vt4);
+ const __m128i vtable5 = _mm_xor_si128(vt4, vt5);
+ const __m128i vtable6 = _mm_xor_si128(vt5, vt6);
+ const __m128i vtable7 = _mm_xor_si128(vt6, vt7);
+ const __m128i vtable8 = _mm_xor_si128(_mm_xor_si128(vt7, vt8), vtable0);
+ const __m128i vtable9 = _mm_xor_si128(_mm_xor_si128(vt8, vt9), vtable1);
+ const __m128i vtableA = _mm_xor_si128(_mm_xor_si128(vt9, vtA), vtable2);
+ const __m128i vtableB = _mm_xor_si128(_mm_xor_si128(vtA, vtB), vtable3);
+ const __m128i vtableC = _mm_xor_si128(_mm_xor_si128(vtB, vtC), vtable4);
+ const __m128i vtableD = _mm_xor_si128(_mm_xor_si128(vtC, vtD), vtable5);
+ const __m128i vtableE = _mm_xor_si128(_mm_xor_si128(vtD, vtE), vtable6);
+ const __m128i vtableF = _mm_xor_si128(_mm_xor_si128(vtE, vtF), vtable7);
+
+ const __m128i voffset = _mm_set1_epi8(16);
+ for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
+ __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
+ __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
+ x += 32;
+
+ __m128i vy0 = _mm_shuffle_epi8(vtable0, vx0);
+ __m128i vy1 = _mm_shuffle_epi8(vtable0, vx1);
+
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable1, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable1, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable2, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable2, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable3, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable3, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable4, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable4, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable5, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable5, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable6, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable6, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable7, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable7, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable8, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable8, vx1));
+
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable9, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable9, vx1));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableA, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableA, vx1));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableB, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableB, vx1));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableC, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableC, vx1));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableD, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableD, vx1));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableE, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableE, vx1));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableF, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableF, vx1));
+
+ _mm_storeu_si128((__m128i*) y, vy0);
+ _mm_storeu_si128((__m128i*) (y + 16), vy1);
+ y += 32;
+ }
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+ x += 16;
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
+
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
+
+ _mm_storeu_si128((__m128i*) y, vy);
+ y += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
+
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
+
+ if (n & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64((__m128i*) y, vy);
+ vy = _mm_unpackhi_epi64(vy, vy);
+ y += 8;
+ }
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_si32(y, vy);
+ vy = _mm_srli_epi64(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ vy = _mm_srli_epi32(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/x8-lut/gen/lut-avx-x48.c b/src/x8-lut/gen/lut-avx-x48.c
new file mode 100644
index 0000000..bd7d041
--- /dev/null
+++ b/src/x8-lut/gen/lut-avx-x48.c
@@ -0,0 +1,267 @@
+// Auto-generated file. Do not edit!
+// Template: src/x8-lut/ssse3.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/lut.h>
+#include <xnnpack/common.h>
+
+
+void xnn_x8_lut_ukernel__avx_x48(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
+{
+ assert(n != 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m128i vt0 = _mm_load_si128((const __m128i*) t);
+ const __m128i vt1 = _mm_load_si128((const __m128i*) (t + 16));
+ const __m128i vt2 = _mm_load_si128((const __m128i*) (t + 32));
+ const __m128i vt3 = _mm_load_si128((const __m128i*) (t + 48));
+ const __m128i vt4 = _mm_load_si128((const __m128i*) (t + 64));
+ const __m128i vt5 = _mm_load_si128((const __m128i*) (t + 80));
+ const __m128i vt6 = _mm_load_si128((const __m128i*) (t + 96));
+ const __m128i vt7 = _mm_load_si128((const __m128i*) (t + 112));
+ const __m128i vt8 = _mm_load_si128((const __m128i*) (t + 128));
+ const __m128i vt9 = _mm_load_si128((const __m128i*) (t + 144));
+ const __m128i vtA = _mm_load_si128((const __m128i*) (t + 160));
+ const __m128i vtB = _mm_load_si128((const __m128i*) (t + 176));
+ const __m128i vtC = _mm_load_si128((const __m128i*) (t + 192));
+ const __m128i vtD = _mm_load_si128((const __m128i*) (t + 208));
+ const __m128i vtE = _mm_load_si128((const __m128i*) (t + 224));
+ const __m128i vtF = _mm_load_si128((const __m128i*) (t + 240));
+
+ const __m128i vtable0 = vt0;
+ const __m128i vtable1 = _mm_xor_si128(vt0, vt1);
+ const __m128i vtable2 = _mm_xor_si128(vt1, vt2);
+ const __m128i vtable3 = _mm_xor_si128(vt2, vt3);
+ const __m128i vtable4 = _mm_xor_si128(vt3, vt4);
+ const __m128i vtable5 = _mm_xor_si128(vt4, vt5);
+ const __m128i vtable6 = _mm_xor_si128(vt5, vt6);
+ const __m128i vtable7 = _mm_xor_si128(vt6, vt7);
+ const __m128i vtable8 = _mm_xor_si128(_mm_xor_si128(vt7, vt8), vtable0);
+ const __m128i vtable9 = _mm_xor_si128(_mm_xor_si128(vt8, vt9), vtable1);
+ const __m128i vtableA = _mm_xor_si128(_mm_xor_si128(vt9, vtA), vtable2);
+ const __m128i vtableB = _mm_xor_si128(_mm_xor_si128(vtA, vtB), vtable3);
+ const __m128i vtableC = _mm_xor_si128(_mm_xor_si128(vtB, vtC), vtable4);
+ const __m128i vtableD = _mm_xor_si128(_mm_xor_si128(vtC, vtD), vtable5);
+ const __m128i vtableE = _mm_xor_si128(_mm_xor_si128(vtD, vtE), vtable6);
+ const __m128i vtableF = _mm_xor_si128(_mm_xor_si128(vtE, vtF), vtable7);
+
+ const __m128i voffset = _mm_set1_epi8(16);
+ for (; n >= 48 * sizeof(uint8_t); n -= 48 * sizeof(uint8_t)) {
+ __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
+ __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
+ __m128i vx2 = _mm_loadu_si128((const __m128i*) (x + 32));
+ x += 48;
+
+ __m128i vy0 = _mm_shuffle_epi8(vtable0, vx0);
+ __m128i vy1 = _mm_shuffle_epi8(vtable0, vx1);
+ __m128i vy2 = _mm_shuffle_epi8(vtable0, vx2);
+
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable1, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable1, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable1, vx2));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable2, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable2, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable2, vx2));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable3, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable3, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable3, vx2));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable4, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable4, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable4, vx2));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable5, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable5, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable5, vx2));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable6, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable6, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable6, vx2));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable7, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable7, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable7, vx2));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable8, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable8, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable8, vx2));
+
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable9, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable9, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable9, vx2));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableA, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableA, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableA, vx2));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableB, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableB, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableB, vx2));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableC, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableC, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableC, vx2));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableD, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableD, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableD, vx2));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableE, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableE, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableE, vx2));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableF, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableF, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableF, vx2));
+
+ _mm_storeu_si128((__m128i*) y, vy0);
+ _mm_storeu_si128((__m128i*) (y + 16), vy1);
+ _mm_storeu_si128((__m128i*) (y + 32), vy2);
+ y += 48;
+ }
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+ x += 16;
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
+
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
+
+ _mm_storeu_si128((__m128i*) y, vy);
+ y += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
+
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
+
+ if (n & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64((__m128i*) y, vy);
+ vy = _mm_unpackhi_epi64(vy, vy);
+ y += 8;
+ }
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_si32(y, vy);
+ vy = _mm_srli_epi64(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ vy = _mm_srli_epi32(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/x8-lut/gen/lut-avx-x64.c b/src/x8-lut/gen/lut-avx-x64.c
new file mode 100644
index 0000000..98cefbf
--- /dev/null
+++ b/src/x8-lut/gen/lut-avx-x64.c
@@ -0,0 +1,300 @@
+// Auto-generated file. Do not edit!
+// Template: src/x8-lut/ssse3.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/lut.h>
+#include <xnnpack/common.h>
+
+
+void xnn_x8_lut_ukernel__avx_x64(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
+{
+ assert(n != 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m128i vt0 = _mm_load_si128((const __m128i*) t);
+ const __m128i vt1 = _mm_load_si128((const __m128i*) (t + 16));
+ const __m128i vt2 = _mm_load_si128((const __m128i*) (t + 32));
+ const __m128i vt3 = _mm_load_si128((const __m128i*) (t + 48));
+ const __m128i vt4 = _mm_load_si128((const __m128i*) (t + 64));
+ const __m128i vt5 = _mm_load_si128((const __m128i*) (t + 80));
+ const __m128i vt6 = _mm_load_si128((const __m128i*) (t + 96));
+ const __m128i vt7 = _mm_load_si128((const __m128i*) (t + 112));
+ const __m128i vt8 = _mm_load_si128((const __m128i*) (t + 128));
+ const __m128i vt9 = _mm_load_si128((const __m128i*) (t + 144));
+ const __m128i vtA = _mm_load_si128((const __m128i*) (t + 160));
+ const __m128i vtB = _mm_load_si128((const __m128i*) (t + 176));
+ const __m128i vtC = _mm_load_si128((const __m128i*) (t + 192));
+ const __m128i vtD = _mm_load_si128((const __m128i*) (t + 208));
+ const __m128i vtE = _mm_load_si128((const __m128i*) (t + 224));
+ const __m128i vtF = _mm_load_si128((const __m128i*) (t + 240));
+
+ const __m128i vtable0 = vt0;
+ const __m128i vtable1 = _mm_xor_si128(vt0, vt1);
+ const __m128i vtable2 = _mm_xor_si128(vt1, vt2);
+ const __m128i vtable3 = _mm_xor_si128(vt2, vt3);
+ const __m128i vtable4 = _mm_xor_si128(vt3, vt4);
+ const __m128i vtable5 = _mm_xor_si128(vt4, vt5);
+ const __m128i vtable6 = _mm_xor_si128(vt5, vt6);
+ const __m128i vtable7 = _mm_xor_si128(vt6, vt7);
+ const __m128i vtable8 = _mm_xor_si128(_mm_xor_si128(vt7, vt8), vtable0);
+ const __m128i vtable9 = _mm_xor_si128(_mm_xor_si128(vt8, vt9), vtable1);
+ const __m128i vtableA = _mm_xor_si128(_mm_xor_si128(vt9, vtA), vtable2);
+ const __m128i vtableB = _mm_xor_si128(_mm_xor_si128(vtA, vtB), vtable3);
+ const __m128i vtableC = _mm_xor_si128(_mm_xor_si128(vtB, vtC), vtable4);
+ const __m128i vtableD = _mm_xor_si128(_mm_xor_si128(vtC, vtD), vtable5);
+ const __m128i vtableE = _mm_xor_si128(_mm_xor_si128(vtD, vtE), vtable6);
+ const __m128i vtableF = _mm_xor_si128(_mm_xor_si128(vtE, vtF), vtable7);
+
+ const __m128i voffset = _mm_set1_epi8(16);
+ for (; n >= 64 * sizeof(uint8_t); n -= 64 * sizeof(uint8_t)) {
+ __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
+ __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
+ __m128i vx2 = _mm_loadu_si128((const __m128i*) (x + 32));
+ __m128i vx3 = _mm_loadu_si128((const __m128i*) (x + 48));
+ x += 64;
+
+ __m128i vy0 = _mm_shuffle_epi8(vtable0, vx0);
+ __m128i vy1 = _mm_shuffle_epi8(vtable0, vx1);
+ __m128i vy2 = _mm_shuffle_epi8(vtable0, vx2);
+ __m128i vy3 = _mm_shuffle_epi8(vtable0, vx3);
+
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vx3 = _mm_sub_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable1, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable1, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable1, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable1, vx3));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vx3 = _mm_sub_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable2, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable2, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable2, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable2, vx3));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vx3 = _mm_sub_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable3, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable3, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable3, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable3, vx3));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vx3 = _mm_sub_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable4, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable4, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable4, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable4, vx3));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vx3 = _mm_sub_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable5, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable5, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable5, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable5, vx3));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vx3 = _mm_sub_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable6, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable6, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable6, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable6, vx3));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vx3 = _mm_sub_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable7, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable7, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable7, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable7, vx3));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vx2 = _mm_sub_epi8(vx2, voffset);
+ vx3 = _mm_sub_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable8, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable8, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable8, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable8, vx3));
+
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vx3 = _mm_subs_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable9, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable9, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable9, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable9, vx3));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vx3 = _mm_subs_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableA, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableA, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableA, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableA, vx3));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vx3 = _mm_subs_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableB, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableB, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableB, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableB, vx3));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vx3 = _mm_subs_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableC, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableC, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableC, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableC, vx3));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vx3 = _mm_subs_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableD, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableD, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableD, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableD, vx3));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vx3 = _mm_subs_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableE, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableE, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableE, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableE, vx3));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vx2 = _mm_subs_epi8(vx2, voffset);
+ vx3 = _mm_subs_epi8(vx3, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableF, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableF, vx1));
+ vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableF, vx2));
+ vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableF, vx3));
+
+ _mm_storeu_si128((__m128i*) y, vy0);
+ _mm_storeu_si128((__m128i*) (y + 16), vy1);
+ _mm_storeu_si128((__m128i*) (y + 32), vy2);
+ _mm_storeu_si128((__m128i*) (y + 48), vy3);
+ y += 64;
+ }
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+ x += 16;
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
+
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
+
+ _mm_storeu_si128((__m128i*) y, vy);
+ y += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
+
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
+
+ if (n & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64((__m128i*) y, vy);
+ vy = _mm_unpackhi_epi64(vy, vy);
+ y += 8;
+ }
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_si32(y, vy);
+ vy = _mm_srli_epi64(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ vy = _mm_srli_epi32(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/x8-lut/gen/lut-avx2-x128.c b/src/x8-lut/gen/lut-avx2-x128.c
new file mode 100644
index 0000000..c61821d
--- /dev/null
+++ b/src/x8-lut/gen/lut-avx2-x128.c
@@ -0,0 +1,300 @@
+// Auto-generated file. Do not edit!
+// Template: src/x8-lut/avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/lut.h>
+#include <xnnpack/common.h>
+
+
+void xnn_x8_lut_ukernel__avx2_x128(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
+{
+ assert(n != 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vt0 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) t));
+ const __m256i vt1 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 16)));
+ const __m256i vt2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 32)));
+ const __m256i vt3 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 48)));
+ const __m256i vt4 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 64)));
+ const __m256i vt5 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 80)));
+ const __m256i vt6 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 96)));
+ const __m256i vt7 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 112)));
+ const __m256i vt8 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 128)));
+ const __m256i vt9 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 144)));
+ const __m256i vtA = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 160)));
+ const __m256i vtB = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 176)));
+ const __m256i vtC = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 192)));
+ const __m256i vtD = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 208)));
+ const __m256i vtE = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 224)));
+ const __m256i vtF = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 240)));
+
+ const __m256i vtable0 = vt0;
+ const __m256i vtable1 = _mm256_xor_si256(vt0, vt1);
+ const __m256i vtable2 = _mm256_xor_si256(vt1, vt2);
+ const __m256i vtable3 = _mm256_xor_si256(vt2, vt3);
+ const __m256i vtable4 = _mm256_xor_si256(vt3, vt4);
+ const __m256i vtable5 = _mm256_xor_si256(vt4, vt5);
+ const __m256i vtable6 = _mm256_xor_si256(vt5, vt6);
+ const __m256i vtable7 = _mm256_xor_si256(vt6, vt7);
+ const __m256i vtable8 = _mm256_xor_si256(_mm256_xor_si256(vt7, vt8), vtable0);
+ const __m256i vtable9 = _mm256_xor_si256(_mm256_xor_si256(vt8, vt9), vtable1);
+ const __m256i vtableA = _mm256_xor_si256(_mm256_xor_si256(vt9, vtA), vtable2);
+ const __m256i vtableB = _mm256_xor_si256(_mm256_xor_si256(vtA, vtB), vtable3);
+ const __m256i vtableC = _mm256_xor_si256(_mm256_xor_si256(vtB, vtC), vtable4);
+ const __m256i vtableD = _mm256_xor_si256(_mm256_xor_si256(vtC, vtD), vtable5);
+ const __m256i vtableE = _mm256_xor_si256(_mm256_xor_si256(vtD, vtE), vtable6);
+ const __m256i vtableF = _mm256_xor_si256(_mm256_xor_si256(vtE, vtF), vtable7);
+
+ const __m256i voffset = _mm256_set1_epi8(16);
+ for (; n >= 128 * sizeof(uint8_t); n -= 128 * sizeof(uint8_t)) {
+ __m256i vx0 = _mm256_loadu_si256((const __m256i*) x);
+ __m256i vx1 = _mm256_loadu_si256((const __m256i*) (x + 32));
+ __m256i vx2 = _mm256_loadu_si256((const __m256i*) (x + 64));
+ __m256i vx3 = _mm256_loadu_si256((const __m256i*) (x + 96));
+ x += 128;
+
+ __m256i vy0 = _mm256_shuffle_epi8(vtable0, vx0);
+ __m256i vy1 = _mm256_shuffle_epi8(vtable0, vx1);
+ __m256i vy2 = _mm256_shuffle_epi8(vtable0, vx2);
+ __m256i vy3 = _mm256_shuffle_epi8(vtable0, vx3);
+
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vx3 = _mm256_sub_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable1, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable1, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable1, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable1, vx3));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vx3 = _mm256_sub_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable2, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable2, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable2, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable2, vx3));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vx3 = _mm256_sub_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable3, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable3, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable3, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable3, vx3));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vx3 = _mm256_sub_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable4, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable4, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable4, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable4, vx3));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vx3 = _mm256_sub_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable5, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable5, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable5, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable5, vx3));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vx3 = _mm256_sub_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable6, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable6, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable6, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable6, vx3));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vx3 = _mm256_sub_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable7, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable7, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable7, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable7, vx3));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vx3 = _mm256_sub_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable8, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable8, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable8, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable8, vx3));
+
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vx3 = _mm256_subs_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable9, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable9, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable9, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable9, vx3));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vx3 = _mm256_subs_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableA, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableA, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableA, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableA, vx3));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vx3 = _mm256_subs_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableB, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableB, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableB, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableB, vx3));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vx3 = _mm256_subs_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableC, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableC, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableC, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableC, vx3));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vx3 = _mm256_subs_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableD, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableD, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableD, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableD, vx3));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vx3 = _mm256_subs_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableE, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableE, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableE, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableE, vx3));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vx3 = _mm256_subs_epi8(vx3, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableF, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableF, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableF, vx2));
+ vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableF, vx3));
+
+ _mm256_storeu_si256((__m256i*) y, vy0);
+ _mm256_storeu_si256((__m256i*) (y + 32), vy1);
+ _mm256_storeu_si256((__m256i*) (y + 64), vy2);
+ _mm256_storeu_si256((__m256i*) (y + 96), vy3);
+ y += 128;
+ }
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+ x += 16;
+
+ __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
+
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
+
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
+
+ _mm_storeu_si128((__m128i*) y, vy);
+ y += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+
+ __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
+
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
+
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
+
+ if (n & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64((__m128i*) y, vy);
+ vy = _mm_unpackhi_epi64(vy, vy);
+ y += 8;
+ }
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_si32(y, vy);
+ vy = _mm_srli_epi64(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ vy = _mm_srli_epi32(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/x8-lut/gen/lut-avx2-x32.c b/src/x8-lut/gen/lut-avx2-x32.c
new file mode 100644
index 0000000..344baca
--- /dev/null
+++ b/src/x8-lut/gen/lut-avx2-x32.c
@@ -0,0 +1,201 @@
+// Auto-generated file. Do not edit!
+// Template: src/x8-lut/avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/lut.h>
+#include <xnnpack/common.h>
+
+
+void xnn_x8_lut_ukernel__avx2_x32(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
+{
+ assert(n != 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vt0 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) t));
+ const __m256i vt1 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 16)));
+ const __m256i vt2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 32)));
+ const __m256i vt3 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 48)));
+ const __m256i vt4 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 64)));
+ const __m256i vt5 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 80)));
+ const __m256i vt6 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 96)));
+ const __m256i vt7 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 112)));
+ const __m256i vt8 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 128)));
+ const __m256i vt9 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 144)));
+ const __m256i vtA = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 160)));
+ const __m256i vtB = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 176)));
+ const __m256i vtC = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 192)));
+ const __m256i vtD = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 208)));
+ const __m256i vtE = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 224)));
+ const __m256i vtF = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 240)));
+
+ const __m256i vtable0 = vt0;
+ const __m256i vtable1 = _mm256_xor_si256(vt0, vt1);
+ const __m256i vtable2 = _mm256_xor_si256(vt1, vt2);
+ const __m256i vtable3 = _mm256_xor_si256(vt2, vt3);
+ const __m256i vtable4 = _mm256_xor_si256(vt3, vt4);
+ const __m256i vtable5 = _mm256_xor_si256(vt4, vt5);
+ const __m256i vtable6 = _mm256_xor_si256(vt5, vt6);
+ const __m256i vtable7 = _mm256_xor_si256(vt6, vt7);
+ const __m256i vtable8 = _mm256_xor_si256(_mm256_xor_si256(vt7, vt8), vtable0);
+ const __m256i vtable9 = _mm256_xor_si256(_mm256_xor_si256(vt8, vt9), vtable1);
+ const __m256i vtableA = _mm256_xor_si256(_mm256_xor_si256(vt9, vtA), vtable2);
+ const __m256i vtableB = _mm256_xor_si256(_mm256_xor_si256(vtA, vtB), vtable3);
+ const __m256i vtableC = _mm256_xor_si256(_mm256_xor_si256(vtB, vtC), vtable4);
+ const __m256i vtableD = _mm256_xor_si256(_mm256_xor_si256(vtC, vtD), vtable5);
+ const __m256i vtableE = _mm256_xor_si256(_mm256_xor_si256(vtD, vtE), vtable6);
+ const __m256i vtableF = _mm256_xor_si256(_mm256_xor_si256(vtE, vtF), vtable7);
+
+ const __m256i voffset = _mm256_set1_epi8(16);
+ for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
+ __m256i vx0 = _mm256_loadu_si256((const __m256i*) x);
+ x += 32;
+
+ __m256i vy0 = _mm256_shuffle_epi8(vtable0, vx0);
+
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable1, vx0));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable2, vx0));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable3, vx0));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable4, vx0));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable5, vx0));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable6, vx0));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable7, vx0));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable8, vx0));
+
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable9, vx0));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableA, vx0));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableB, vx0));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableC, vx0));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableD, vx0));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableE, vx0));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableF, vx0));
+
+ _mm256_storeu_si256((__m256i*) y, vy0);
+ y += 32;
+ }
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+ x += 16;
+
+ __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
+
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
+
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
+
+ _mm_storeu_si128((__m128i*) y, vy);
+ y += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+
+ __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
+
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
+
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
+
+ if (n & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64((__m128i*) y, vy);
+ vy = _mm_unpackhi_epi64(vy, vy);
+ y += 8;
+ }
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_si32(y, vy);
+ vy = _mm_srli_epi64(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ vy = _mm_srli_epi32(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/x8-lut/gen/lut-avx2-x64.c b/src/x8-lut/gen/lut-avx2-x64.c
new file mode 100644
index 0000000..5162761
--- /dev/null
+++ b/src/x8-lut/gen/lut-avx2-x64.c
@@ -0,0 +1,234 @@
+// Auto-generated file. Do not edit!
+// Template: src/x8-lut/avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/lut.h>
+#include <xnnpack/common.h>
+
+
+void xnn_x8_lut_ukernel__avx2_x64(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
+{
+ assert(n != 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vt0 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) t));
+ const __m256i vt1 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 16)));
+ const __m256i vt2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 32)));
+ const __m256i vt3 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 48)));
+ const __m256i vt4 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 64)));
+ const __m256i vt5 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 80)));
+ const __m256i vt6 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 96)));
+ const __m256i vt7 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 112)));
+ const __m256i vt8 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 128)));
+ const __m256i vt9 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 144)));
+ const __m256i vtA = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 160)));
+ const __m256i vtB = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 176)));
+ const __m256i vtC = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 192)));
+ const __m256i vtD = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 208)));
+ const __m256i vtE = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 224)));
+ const __m256i vtF = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 240)));
+
+ const __m256i vtable0 = vt0;
+ const __m256i vtable1 = _mm256_xor_si256(vt0, vt1);
+ const __m256i vtable2 = _mm256_xor_si256(vt1, vt2);
+ const __m256i vtable3 = _mm256_xor_si256(vt2, vt3);
+ const __m256i vtable4 = _mm256_xor_si256(vt3, vt4);
+ const __m256i vtable5 = _mm256_xor_si256(vt4, vt5);
+ const __m256i vtable6 = _mm256_xor_si256(vt5, vt6);
+ const __m256i vtable7 = _mm256_xor_si256(vt6, vt7);
+ const __m256i vtable8 = _mm256_xor_si256(_mm256_xor_si256(vt7, vt8), vtable0);
+ const __m256i vtable9 = _mm256_xor_si256(_mm256_xor_si256(vt8, vt9), vtable1);
+ const __m256i vtableA = _mm256_xor_si256(_mm256_xor_si256(vt9, vtA), vtable2);
+ const __m256i vtableB = _mm256_xor_si256(_mm256_xor_si256(vtA, vtB), vtable3);
+ const __m256i vtableC = _mm256_xor_si256(_mm256_xor_si256(vtB, vtC), vtable4);
+ const __m256i vtableD = _mm256_xor_si256(_mm256_xor_si256(vtC, vtD), vtable5);
+ const __m256i vtableE = _mm256_xor_si256(_mm256_xor_si256(vtD, vtE), vtable6);
+ const __m256i vtableF = _mm256_xor_si256(_mm256_xor_si256(vtE, vtF), vtable7);
+
+ const __m256i voffset = _mm256_set1_epi8(16);
+ for (; n >= 64 * sizeof(uint8_t); n -= 64 * sizeof(uint8_t)) {
+ __m256i vx0 = _mm256_loadu_si256((const __m256i*) x);
+ __m256i vx1 = _mm256_loadu_si256((const __m256i*) (x + 32));
+ x += 64;
+
+ __m256i vy0 = _mm256_shuffle_epi8(vtable0, vx0);
+ __m256i vy1 = _mm256_shuffle_epi8(vtable0, vx1);
+
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable1, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable1, vx1));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable2, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable2, vx1));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable3, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable3, vx1));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable4, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable4, vx1));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable5, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable5, vx1));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable6, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable6, vx1));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable7, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable7, vx1));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable8, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable8, vx1));
+
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable9, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable9, vx1));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableA, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableA, vx1));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableB, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableB, vx1));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableC, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableC, vx1));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableD, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableD, vx1));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableE, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableE, vx1));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableF, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableF, vx1));
+
+ _mm256_storeu_si256((__m256i*) y, vy0);
+ _mm256_storeu_si256((__m256i*) (y + 32), vy1);
+ y += 64;
+ }
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+ x += 16;
+
+ __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
+
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
+
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
+
+ _mm_storeu_si128((__m128i*) y, vy);
+ y += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+
+ __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
+
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
+
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
+
+ if (n & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64((__m128i*) y, vy);
+ vy = _mm_unpackhi_epi64(vy, vy);
+ y += 8;
+ }
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_si32(y, vy);
+ vy = _mm_srli_epi64(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ vy = _mm_srli_epi32(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/x8-lut/gen/lut-avx2-x96.c b/src/x8-lut/gen/lut-avx2-x96.c
new file mode 100644
index 0000000..8455082
--- /dev/null
+++ b/src/x8-lut/gen/lut-avx2-x96.c
@@ -0,0 +1,267 @@
+// Auto-generated file. Do not edit!
+// Template: src/x8-lut/avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/lut.h>
+#include <xnnpack/common.h>
+
+
+void xnn_x8_lut_ukernel__avx2_x96(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
+{
+ assert(n != 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m256i vt0 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) t));
+ const __m256i vt1 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 16)));
+ const __m256i vt2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 32)));
+ const __m256i vt3 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 48)));
+ const __m256i vt4 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 64)));
+ const __m256i vt5 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 80)));
+ const __m256i vt6 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 96)));
+ const __m256i vt7 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 112)));
+ const __m256i vt8 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 128)));
+ const __m256i vt9 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 144)));
+ const __m256i vtA = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 160)));
+ const __m256i vtB = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 176)));
+ const __m256i vtC = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 192)));
+ const __m256i vtD = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 208)));
+ const __m256i vtE = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 224)));
+ const __m256i vtF = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 240)));
+
+ const __m256i vtable0 = vt0;
+ const __m256i vtable1 = _mm256_xor_si256(vt0, vt1);
+ const __m256i vtable2 = _mm256_xor_si256(vt1, vt2);
+ const __m256i vtable3 = _mm256_xor_si256(vt2, vt3);
+ const __m256i vtable4 = _mm256_xor_si256(vt3, vt4);
+ const __m256i vtable5 = _mm256_xor_si256(vt4, vt5);
+ const __m256i vtable6 = _mm256_xor_si256(vt5, vt6);
+ const __m256i vtable7 = _mm256_xor_si256(vt6, vt7);
+ const __m256i vtable8 = _mm256_xor_si256(_mm256_xor_si256(vt7, vt8), vtable0);
+ const __m256i vtable9 = _mm256_xor_si256(_mm256_xor_si256(vt8, vt9), vtable1);
+ const __m256i vtableA = _mm256_xor_si256(_mm256_xor_si256(vt9, vtA), vtable2);
+ const __m256i vtableB = _mm256_xor_si256(_mm256_xor_si256(vtA, vtB), vtable3);
+ const __m256i vtableC = _mm256_xor_si256(_mm256_xor_si256(vtB, vtC), vtable4);
+ const __m256i vtableD = _mm256_xor_si256(_mm256_xor_si256(vtC, vtD), vtable5);
+ const __m256i vtableE = _mm256_xor_si256(_mm256_xor_si256(vtD, vtE), vtable6);
+ const __m256i vtableF = _mm256_xor_si256(_mm256_xor_si256(vtE, vtF), vtable7);
+
+ const __m256i voffset = _mm256_set1_epi8(16);
+ for (; n >= 96 * sizeof(uint8_t); n -= 96 * sizeof(uint8_t)) {
+ __m256i vx0 = _mm256_loadu_si256((const __m256i*) x);
+ __m256i vx1 = _mm256_loadu_si256((const __m256i*) (x + 32));
+ __m256i vx2 = _mm256_loadu_si256((const __m256i*) (x + 64));
+ x += 96;
+
+ __m256i vy0 = _mm256_shuffle_epi8(vtable0, vx0);
+ __m256i vy1 = _mm256_shuffle_epi8(vtable0, vx1);
+ __m256i vy2 = _mm256_shuffle_epi8(vtable0, vx2);
+
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable1, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable1, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable1, vx2));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable2, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable2, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable2, vx2));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable3, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable3, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable3, vx2));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable4, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable4, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable4, vx2));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable5, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable5, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable5, vx2));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable6, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable6, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable6, vx2));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable7, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable7, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable7, vx2));
+ vx0 = _mm256_sub_epi8(vx0, voffset);
+ vx1 = _mm256_sub_epi8(vx1, voffset);
+ vx2 = _mm256_sub_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable8, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable8, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable8, vx2));
+
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable9, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable9, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable9, vx2));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableA, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableA, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableA, vx2));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableB, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableB, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableB, vx2));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableC, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableC, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableC, vx2));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableD, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableD, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableD, vx2));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableE, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableE, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableE, vx2));
+ vx0 = _mm256_subs_epi8(vx0, voffset);
+ vx1 = _mm256_subs_epi8(vx1, voffset);
+ vx2 = _mm256_subs_epi8(vx2, voffset);
+ vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableF, vx0));
+ vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableF, vx1));
+ vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableF, vx2));
+
+ _mm256_storeu_si256((__m256i*) y, vy0);
+ _mm256_storeu_si256((__m256i*) (y + 32), vy1);
+ _mm256_storeu_si256((__m256i*) (y + 64), vy2);
+ y += 96;
+ }
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+ x += 16;
+
+ __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
+
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
+
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
+
+ _mm_storeu_si128((__m128i*) y, vy);
+ y += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+
+ __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
+
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
+ vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
+
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
+ vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
+
+ if (n & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64((__m128i*) y, vy);
+ vy = _mm_unpackhi_epi64(vy, vy);
+ y += 8;
+ }
+ if (n & (4 * sizeof(uint8_t))) {
+ _mm_storeu_si32(y, vy);
+ vy = _mm_srli_epi64(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ vy = _mm_srli_epi32(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ }
+}
diff --git a/src/x8-lut/gen/lut-ssse3-x16.c b/src/x8-lut/gen/lut-ssse3-x16.c
new file mode 100644
index 0000000..0a39326
--- /dev/null
+++ b/src/x8-lut/gen/lut-ssse3-x16.c
@@ -0,0 +1,161 @@
+// Auto-generated file. Do not edit!
+// Template: src/x8-lut/ssse3.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <tmmintrin.h>
+
+#include <xnnpack/lut.h>
+#include <xnnpack/common.h>
+
+
+void xnn_x8_lut_ukernel__ssse3_x16(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
+{
+ assert(n != 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m128i vt0 = _mm_load_si128((const __m128i*) t);
+ const __m128i vt1 = _mm_load_si128((const __m128i*) (t + 16));
+ const __m128i vt2 = _mm_load_si128((const __m128i*) (t + 32));
+ const __m128i vt3 = _mm_load_si128((const __m128i*) (t + 48));
+ const __m128i vt4 = _mm_load_si128((const __m128i*) (t + 64));
+ const __m128i vt5 = _mm_load_si128((const __m128i*) (t + 80));
+ const __m128i vt6 = _mm_load_si128((const __m128i*) (t + 96));
+ const __m128i vt7 = _mm_load_si128((const __m128i*) (t + 112));
+ const __m128i vt8 = _mm_load_si128((const __m128i*) (t + 128));
+ const __m128i vt9 = _mm_load_si128((const __m128i*) (t + 144));
+ const __m128i vtA = _mm_load_si128((const __m128i*) (t + 160));
+ const __m128i vtB = _mm_load_si128((const __m128i*) (t + 176));
+ const __m128i vtC = _mm_load_si128((const __m128i*) (t + 192));
+ const __m128i vtD = _mm_load_si128((const __m128i*) (t + 208));
+ const __m128i vtE = _mm_load_si128((const __m128i*) (t + 224));
+ const __m128i vtF = _mm_load_si128((const __m128i*) (t + 240));
+
+ const __m128i vtable0 = vt0;
+ const __m128i vtable1 = _mm_xor_si128(vt0, vt1);
+ const __m128i vtable2 = _mm_xor_si128(vt1, vt2);
+ const __m128i vtable3 = _mm_xor_si128(vt2, vt3);
+ const __m128i vtable4 = _mm_xor_si128(vt3, vt4);
+ const __m128i vtable5 = _mm_xor_si128(vt4, vt5);
+ const __m128i vtable6 = _mm_xor_si128(vt5, vt6);
+ const __m128i vtable7 = _mm_xor_si128(vt6, vt7);
+ const __m128i vtable8 = _mm_xor_si128(_mm_xor_si128(vt7, vt8), vtable0);
+ const __m128i vtable9 = _mm_xor_si128(_mm_xor_si128(vt8, vt9), vtable1);
+ const __m128i vtableA = _mm_xor_si128(_mm_xor_si128(vt9, vtA), vtable2);
+ const __m128i vtableB = _mm_xor_si128(_mm_xor_si128(vtA, vtB), vtable3);
+ const __m128i vtableC = _mm_xor_si128(_mm_xor_si128(vtB, vtC), vtable4);
+ const __m128i vtableD = _mm_xor_si128(_mm_xor_si128(vtC, vtD), vtable5);
+ const __m128i vtableE = _mm_xor_si128(_mm_xor_si128(vtD, vtE), vtable6);
+ const __m128i vtableF = _mm_xor_si128(_mm_xor_si128(vtE, vtF), vtable7);
+
+ const __m128i voffset = _mm_set1_epi8(16);
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+ x += 16;
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
+
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
+
+ _mm_storeu_si128((__m128i*) y, vy);
+ y += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
+
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
+
+ if (n & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64((__m128i*) y, vy);
+ vy = _mm_unpackhi_epi64(vy, vy);
+ y += 8;
+ }
+ if (n & (4 * sizeof(uint8_t))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+ vy = _mm_srli_epi64(vy, 32);
+ y += 4;
+ }
+ uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) vy_lo;
+ vy_lo >>= 16;
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) vy_lo;
+ }
+ }
+}
diff --git a/src/x8-lut/gen/lut-ssse3-x32.c b/src/x8-lut/gen/lut-ssse3-x32.c
new file mode 100644
index 0000000..54beb23
--- /dev/null
+++ b/src/x8-lut/gen/lut-ssse3-x32.c
@@ -0,0 +1,235 @@
+// Auto-generated file. Do not edit!
+// Template: src/x8-lut/ssse3.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <tmmintrin.h>
+
+#include <xnnpack/lut.h>
+#include <xnnpack/common.h>
+
+
+void xnn_x8_lut_ukernel__ssse3_x32(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
+{
+ assert(n != 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m128i vt0 = _mm_load_si128((const __m128i*) t);
+ const __m128i vt1 = _mm_load_si128((const __m128i*) (t + 16));
+ const __m128i vt2 = _mm_load_si128((const __m128i*) (t + 32));
+ const __m128i vt3 = _mm_load_si128((const __m128i*) (t + 48));
+ const __m128i vt4 = _mm_load_si128((const __m128i*) (t + 64));
+ const __m128i vt5 = _mm_load_si128((const __m128i*) (t + 80));
+ const __m128i vt6 = _mm_load_si128((const __m128i*) (t + 96));
+ const __m128i vt7 = _mm_load_si128((const __m128i*) (t + 112));
+ const __m128i vt8 = _mm_load_si128((const __m128i*) (t + 128));
+ const __m128i vt9 = _mm_load_si128((const __m128i*) (t + 144));
+ const __m128i vtA = _mm_load_si128((const __m128i*) (t + 160));
+ const __m128i vtB = _mm_load_si128((const __m128i*) (t + 176));
+ const __m128i vtC = _mm_load_si128((const __m128i*) (t + 192));
+ const __m128i vtD = _mm_load_si128((const __m128i*) (t + 208));
+ const __m128i vtE = _mm_load_si128((const __m128i*) (t + 224));
+ const __m128i vtF = _mm_load_si128((const __m128i*) (t + 240));
+
+ const __m128i vtable0 = vt0;
+ const __m128i vtable1 = _mm_xor_si128(vt0, vt1);
+ const __m128i vtable2 = _mm_xor_si128(vt1, vt2);
+ const __m128i vtable3 = _mm_xor_si128(vt2, vt3);
+ const __m128i vtable4 = _mm_xor_si128(vt3, vt4);
+ const __m128i vtable5 = _mm_xor_si128(vt4, vt5);
+ const __m128i vtable6 = _mm_xor_si128(vt5, vt6);
+ const __m128i vtable7 = _mm_xor_si128(vt6, vt7);
+ const __m128i vtable8 = _mm_xor_si128(_mm_xor_si128(vt7, vt8), vtable0);
+ const __m128i vtable9 = _mm_xor_si128(_mm_xor_si128(vt8, vt9), vtable1);
+ const __m128i vtableA = _mm_xor_si128(_mm_xor_si128(vt9, vtA), vtable2);
+ const __m128i vtableB = _mm_xor_si128(_mm_xor_si128(vtA, vtB), vtable3);
+ const __m128i vtableC = _mm_xor_si128(_mm_xor_si128(vtB, vtC), vtable4);
+ const __m128i vtableD = _mm_xor_si128(_mm_xor_si128(vtC, vtD), vtable5);
+ const __m128i vtableE = _mm_xor_si128(_mm_xor_si128(vtD, vtE), vtable6);
+ const __m128i vtableF = _mm_xor_si128(_mm_xor_si128(vtE, vtF), vtable7);
+
+ const __m128i voffset = _mm_set1_epi8(16);
+ for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
+ __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
+ __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
+ x += 32;
+
+ __m128i vy0 = _mm_shuffle_epi8(vtable0, vx0);
+ __m128i vy1 = _mm_shuffle_epi8(vtable0, vx1);
+
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable1, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable1, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable2, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable2, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable3, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable3, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable4, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable4, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable5, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable5, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable6, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable6, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable7, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable7, vx1));
+ vx0 = _mm_sub_epi8(vx0, voffset);
+ vx1 = _mm_sub_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable8, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable8, vx1));
+
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable9, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable9, vx1));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableA, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableA, vx1));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableB, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableB, vx1));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableC, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableC, vx1));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableD, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableD, vx1));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableE, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableE, vx1));
+ vx0 = _mm_subs_epi8(vx0, voffset);
+ vx1 = _mm_subs_epi8(vx1, voffset);
+ vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableF, vx0));
+ vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableF, vx1));
+
+ _mm_storeu_si128((__m128i*) y, vy0);
+ _mm_storeu_si128((__m128i*) (y + 16), vy1);
+ y += 32;
+ }
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+ x += 16;
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
+
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
+
+ _mm_storeu_si128((__m128i*) y, vy);
+ y += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
+
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
+
+ if (n & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64((__m128i*) y, vy);
+ vy = _mm_unpackhi_epi64(vy, vy);
+ y += 8;
+ }
+ if (n & (4 * sizeof(uint8_t))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+ vy = _mm_srli_epi64(vy, 32);
+ y += 4;
+ }
+ uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) vy_lo;
+ vy_lo >>= 16;
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) vy_lo;
+ }
+ }
+}
diff --git a/src/x8-lut/ssse3.c.in b/src/x8-lut/ssse3.c.in
new file mode 100644
index 0000000..a72652c
--- /dev/null
+++ b/src/x8-lut/ssse3.c.in
@@ -0,0 +1,132 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE >= 16
+$assert BATCH_TILE % 16 == 0
+$SIMD_TILE = BATCH_TILE // 16
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+$if AVX:
+ #include <immintrin.h>
+$else:
+ #include <tmmintrin.h>
+
+#include <xnnpack/lut.h>
+#include <xnnpack/common.h>
+
+
+void xnn_x8_lut_ukernel__${"avx" if AVX else "ssse3"}_x${BATCH_TILE}(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
+{
+ assert(n != 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const __m128i vt0 = _mm_load_si128((const __m128i*) t);
+ $for T in range(1, 16):
+ const __m128i vt${ABC[T]} = _mm_load_si128((const __m128i*) (t + ${T * 16}));
+
+ const __m128i vtable0 = vt0;
+ $for T in range(1, 8):
+ const __m128i vtable${ABC[T]} = _mm_xor_si128(vt${ABC[T-1]}, vt${ABC[T]});
+ $for T in range(8, 16):
+ const __m128i vtable${ABC[T]} = _mm_xor_si128(_mm_xor_si128(vt${ABC[T-1]}, vt${ABC[T]}), vtable${ABC[T-8]});
+
+ const __m128i voffset = _mm_set1_epi8(16);
+ $if BATCH_TILE > 16:
+ for (; n >= ${BATCH_TILE} * sizeof(uint8_t); n -= ${BATCH_TILE} * sizeof(uint8_t)) {
+ __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
+ $for N in range(1, SIMD_TILE):
+ __m128i vx${N} = _mm_loadu_si128((const __m128i*) (x + ${N * 16}));
+ x += ${BATCH_TILE};
+
+ $for N in range(SIMD_TILE):
+ __m128i vy${N} = _mm_shuffle_epi8(vtable0, vx${N});
+
+ $for T in range(1, 9):
+ $for N in range(SIMD_TILE):
+ vx${N} = _mm_sub_epi8(vx${N}, voffset);
+ $for N in range(SIMD_TILE):
+ vy${N} = _mm_xor_si128(vy${N}, _mm_shuffle_epi8(vtable${ABC[T]}, vx${N}));
+
+ $for T in range(9, 16):
+ $for N in range(SIMD_TILE):
+ vx${N} = _mm_subs_epi8(vx${N}, voffset);
+ $for N in range(SIMD_TILE):
+ vy${N} = _mm_xor_si128(vy${N}, _mm_shuffle_epi8(vtable${ABC[T]}, vx${N}));
+
+ _mm_storeu_si128((__m128i*) y, vy0);
+ $for N in range(1, SIMD_TILE):
+ _mm_storeu_si128((__m128i*) (y + ${N * 16}), vy${N});
+ y += ${BATCH_TILE};
+ }
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+ x += 16;
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ $for T in range(1, 9):
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable${ABC[T]}, vx));
+
+ $for T in range(9, 16):
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable${ABC[T]}, vx));
+
+ _mm_storeu_si128((__m128i*) y, vy);
+ y += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m128i vx = _mm_loadu_si128((const __m128i*) x);
+
+ __m128i vy = _mm_shuffle_epi8(vtable0, vx);
+
+ $for T in range(1, 9):
+ vx = _mm_sub_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable${ABC[T]}, vx));
+
+ $for T in range(9, 16):
+ vx = _mm_subs_epi8(vx, voffset);
+ vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable${ABC[T]}, vx));
+
+ if (n & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64((__m128i*) y, vy);
+ vy = _mm_unpackhi_epi64(vy, vy);
+ y += 8;
+ }
+ if (n & (4 * sizeof(uint8_t))) {
+ $if AVX:
+ _mm_storeu_si32(y, vy);
+ $else:
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
+ vy = _mm_srli_epi64(vy, 32);
+ y += 4;
+ }
+ $if AVX:
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
+ vy = _mm_srli_epi32(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) _mm_extract_epi8(vy, 0);
+ }
+ $else:
+ uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
+ if (n & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) y) = (uint16_t) vy_lo;
+ vy_lo >>= 16;
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ *y = (uint8_t) vy_lo;
+ }
+ }
+}
diff --git a/src/xnnpack/lut.h b/src/xnnpack/lut.h
index 0850efc..b15cd93 100644
--- a/src/xnnpack/lut.h
+++ b/src/xnnpack/lut.h
@@ -37,6 +37,19 @@
DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__neon_tbx128x4_x48)
DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__neon_tbx128x4_x64)
+DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__ssse3_x16)
+DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__ssse3_x32)
+
+DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__avx_x16)
+DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__avx_x32)
+DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__avx_x48)
+DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__avx_x64)
+
+DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__avx2_x32)
+DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__avx2_x64)
+DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__avx2_x96)
+DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__avx2_x128)
+
#define DECLARE_U8_LUT32NORM_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
diff --git a/test/x8-lut.cc b/test/x8-lut.cc
index a101c4e..f316650 100644
--- a/test/x8-lut.cc
+++ b/test/x8-lut.cc
@@ -382,3 +382,473 @@
}
}
#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(X8_LUT__SSSE3_X16, batch_eq_16) {
+ TEST_REQUIRES_X86_SSSE3;
+ LUTMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_x8_lut_ukernel__ssse3_x16);
+ }
+
+ TEST(X8_LUT__SSSE3_X16, batch_div_16) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__ssse3_x16);
+ }
+ }
+
+ TEST(X8_LUT__SSSE3_X16, batch_lt_16) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__ssse3_x16);
+ }
+ }
+
+ TEST(X8_LUT__SSSE3_X16, batch_gt_16) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__ssse3_x16);
+ }
+ }
+
+ TEST(X8_LUT__SSSE3_X16, inplace) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_x8_lut_ukernel__ssse3_x16);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(X8_LUT__SSSE3_X32, batch_eq_32) {
+ TEST_REQUIRES_X86_SSSE3;
+ LUTMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_x8_lut_ukernel__ssse3_x32);
+ }
+
+ TEST(X8_LUT__SSSE3_X32, batch_div_32) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__ssse3_x32);
+ }
+ }
+
+ TEST(X8_LUT__SSSE3_X32, batch_lt_32) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__ssse3_x32);
+ }
+ }
+
+ TEST(X8_LUT__SSSE3_X32, batch_gt_32) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__ssse3_x32);
+ }
+ }
+
+ TEST(X8_LUT__SSSE3_X32, inplace) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_x8_lut_ukernel__ssse3_x32);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(X8_LUT__AVX_X16, batch_eq_16) {
+ TEST_REQUIRES_X86_AVX;
+ LUTMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_x8_lut_ukernel__avx_x16);
+ }
+
+ TEST(X8_LUT__AVX_X16, batch_div_16) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx_x16);
+ }
+ }
+
+ TEST(X8_LUT__AVX_X16, batch_lt_16) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx_x16);
+ }
+ }
+
+ TEST(X8_LUT__AVX_X16, batch_gt_16) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx_x16);
+ }
+ }
+
+ TEST(X8_LUT__AVX_X16, inplace) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_x8_lut_ukernel__avx_x16);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(X8_LUT__AVX_X32, batch_eq_32) {
+ TEST_REQUIRES_X86_AVX;
+ LUTMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_x8_lut_ukernel__avx_x32);
+ }
+
+ TEST(X8_LUT__AVX_X32, batch_div_32) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx_x32);
+ }
+ }
+
+ TEST(X8_LUT__AVX_X32, batch_lt_32) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx_x32);
+ }
+ }
+
+ TEST(X8_LUT__AVX_X32, batch_gt_32) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx_x32);
+ }
+ }
+
+ TEST(X8_LUT__AVX_X32, inplace) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_x8_lut_ukernel__avx_x32);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(X8_LUT__AVX_X48, batch_eq_48) {
+ TEST_REQUIRES_X86_AVX;
+ LUTMicrokernelTester()
+ .batch_size(48)
+ .Test(xnn_x8_lut_ukernel__avx_x48);
+ }
+
+ TEST(X8_LUT__AVX_X48, batch_div_48) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 96; batch_size < 480; batch_size += 48) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx_x48);
+ }
+ }
+
+ TEST(X8_LUT__AVX_X48, batch_lt_48) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size < 48; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx_x48);
+ }
+ }
+
+ TEST(X8_LUT__AVX_X48, batch_gt_48) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 49; batch_size < 96; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx_x48);
+ }
+ }
+
+ TEST(X8_LUT__AVX_X48, inplace) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 240; batch_size += 47) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_x8_lut_ukernel__avx_x48);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(X8_LUT__AVX_X64, batch_eq_64) {
+ TEST_REQUIRES_X86_AVX;
+ LUTMicrokernelTester()
+ .batch_size(64)
+ .Test(xnn_x8_lut_ukernel__avx_x64);
+ }
+
+ TEST(X8_LUT__AVX_X64, batch_div_64) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 128; batch_size < 640; batch_size += 64) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx_x64);
+ }
+ }
+
+ TEST(X8_LUT__AVX_X64, batch_lt_64) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size < 64; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx_x64);
+ }
+ }
+
+ TEST(X8_LUT__AVX_X64, batch_gt_64) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 65; batch_size < 128; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx_x64);
+ }
+ }
+
+ TEST(X8_LUT__AVX_X64, inplace) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t batch_size = 1; batch_size <= 320; batch_size += 63) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_x8_lut_ukernel__avx_x64);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(X8_LUT__AVX2_X32, batch_eq_32) {
+ TEST_REQUIRES_X86_AVX2;
+ LUTMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_x8_lut_ukernel__avx2_x32);
+ }
+
+ TEST(X8_LUT__AVX2_X32, batch_div_32) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx2_x32);
+ }
+ }
+
+ TEST(X8_LUT__AVX2_X32, batch_lt_32) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx2_x32);
+ }
+ }
+
+ TEST(X8_LUT__AVX2_X32, batch_gt_32) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx2_x32);
+ }
+ }
+
+ TEST(X8_LUT__AVX2_X32, inplace) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_x8_lut_ukernel__avx2_x32);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(X8_LUT__AVX2_X64, batch_eq_64) {
+ TEST_REQUIRES_X86_AVX2;
+ LUTMicrokernelTester()
+ .batch_size(64)
+ .Test(xnn_x8_lut_ukernel__avx2_x64);
+ }
+
+ TEST(X8_LUT__AVX2_X64, batch_div_64) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 128; batch_size < 640; batch_size += 64) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx2_x64);
+ }
+ }
+
+ TEST(X8_LUT__AVX2_X64, batch_lt_64) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size < 64; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx2_x64);
+ }
+ }
+
+ TEST(X8_LUT__AVX2_X64, batch_gt_64) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 65; batch_size < 128; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx2_x64);
+ }
+ }
+
+ TEST(X8_LUT__AVX2_X64, inplace) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size <= 320; batch_size += 63) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_x8_lut_ukernel__avx2_x64);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(X8_LUT__AVX2_X96, batch_eq_96) {
+ TEST_REQUIRES_X86_AVX2;
+ LUTMicrokernelTester()
+ .batch_size(96)
+ .Test(xnn_x8_lut_ukernel__avx2_x96);
+ }
+
+ TEST(X8_LUT__AVX2_X96, batch_div_96) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 192; batch_size < 960; batch_size += 96) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx2_x96);
+ }
+ }
+
+ TEST(X8_LUT__AVX2_X96, batch_lt_96) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size < 96; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx2_x96);
+ }
+ }
+
+ TEST(X8_LUT__AVX2_X96, batch_gt_96) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 97; batch_size < 192; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx2_x96);
+ }
+ }
+
+ TEST(X8_LUT__AVX2_X96, inplace) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size <= 480; batch_size += 95) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_x8_lut_ukernel__avx2_x96);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(X8_LUT__AVX2_X128, batch_eq_128) {
+ TEST_REQUIRES_X86_AVX2;
+ LUTMicrokernelTester()
+ .batch_size(128)
+ .Test(xnn_x8_lut_ukernel__avx2_x128);
+ }
+
+ TEST(X8_LUT__AVX2_X128, batch_div_128) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 256; batch_size < 1280; batch_size += 128) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx2_x128);
+ }
+ }
+
+ TEST(X8_LUT__AVX2_X128, batch_lt_128) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size < 128; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx2_x128);
+ }
+ }
+
+ TEST(X8_LUT__AVX2_X128, batch_gt_128) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 129; batch_size < 256; batch_size++) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_x8_lut_ukernel__avx2_x128);
+ }
+ }
+
+ TEST(X8_LUT__AVX2_X128, inplace) {
+ TEST_REQUIRES_X86_AVX2;
+ for (size_t batch_size = 1; batch_size <= 640; batch_size += 127) {
+ LUTMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_x8_lut_ukernel__avx2_x128);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/test/x8-lut.yaml b/test/x8-lut.yaml
index 2947a9c..bc6ba0b 100644
--- a/test/x8-lut.yaml
+++ b/test/x8-lut.yaml
@@ -20,3 +20,13 @@
- name: xnn_x8_lut_ukernel__neon_tbx128x4_x64
arch:
- aarch64
+- name: xnn_x8_lut_ukernel__ssse3_x16
+- name: xnn_x8_lut_ukernel__ssse3_x32
+- name: xnn_x8_lut_ukernel__avx_x16
+- name: xnn_x8_lut_ukernel__avx_x32
+- name: xnn_x8_lut_ukernel__avx_x48
+- name: xnn_x8_lut_ukernel__avx_x64
+- name: xnn_x8_lut_ukernel__avx2_x32
+- name: xnn_x8_lut_ukernel__avx2_x64
+- name: xnn_x8_lut_ukernel__avx2_x96
+- name: xnn_x8_lut_ukernel__avx2_x128